Machine learning work steps through python

Import all useable library

In [1]:
# import relevant modules
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as sn
import scipy.stats as stats
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor
from patsy import dmatrices
%matplotlib inline

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Settings
pd.set_option('display.max_columns', None)
np.set_printoptions(threshold=np.nan)
np.set_printoptions(precision=3)
sns.set(style="darkgrid")
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

Load the Data set.

In [2]:
custdata_df = pd.read_excel("Data Set.xlsx", sheetname="customer_dbase")
In [3]:
custdata_df.sample(5)
Out[3]:
custid region townsize gender age agecat birthmonth ed edcat jobcat union employ empcat retire income lninc inccat debtinc creddebt lncreddebt othdebt lnothdebt default jobsat marital spoused spousedcat reside pets pets_cats pets_dogs pets_birds pets_reptiles pets_small pets_saltfish pets_freshfish homeown hometype address addresscat cars carown cartype carvalue carcatvalue carbought carbuy commute commutecat commutetime commutecar commutemotorcycle commutecarpool commutebus commuterail commutepublic commutebike commutewalk commutenonmotor telecommute reason polview polparty polcontrib vote card cardtype cardbenefit cardfee cardtenure cardtenurecat card2 card2type card2benefit card2fee card2tenure card2tenurecat cardspent card2spent active bfast tenure churn longmon lnlongmon longten lnlongten tollfree tollmon lntollmon tollten lntollten equip equipmon lnequipmon equipten lnequipten callcard cardmon lncardmon cardten lncardten wireless wiremon lnwiremon wireten lnwireten multline voice pager internet callid callwait forward confer ebill owntv hourstv ownvcr owndvd owncd ownpda ownpc ownipod owngame ownfax news response_01 response_02 response_03
394 8512-KZJXAA-A34 5 2.0 0 58 5 July 13 2 3 0 13 4 0 54 3.988984 3 16.4 1.186704 0.171180 7.669296 2.037225 0 4 0 -1 -1 1 7 0 0 0 0 0 0 7 1 1 25 4 3 1 0 28.0 2 0 1 1 1 23.0 1 0 1 1 0 0 0 0 0 0 9 5 0 0 1 3 3 2 0 18 5 2 2 1 0 14 4 226.22 62.82 1 3 43 0 9.10 2.208274 399.10 5.989212 1 37.25 3.617652 1616.55 7.388050 0 0.0 NaN 0.00 NaN 1 32.50 3.481240 1395.0 7.240650 0 0.0 NaN 0.00 NaN 1 0 0 0 1 1 1 1 0 1 18 1 1 1 0 1 1 0 0 1 0 0 0
218 8477-FURXBL-V98 1 1.0 0 47 4 March 15 3 3 1 8 3 0 45 3.806662 2 19.7 2.526525 0.926845 6.338475 1.846638 0 5 0 -1 -1 1 2 1 1 0 0 0 0 0 1 1 24 4 1 1 1 25.3 2 0 1 3 2 14.0 0 0 1 0 0 0 0 0 0 0 9 4 0 0 0 4 1 4 0 4 2 3 2 3 0 6 3 42.69 12.08 0 3 7 0 4.05 1.398717 27.50 3.314186 0 0.00 NaN 0.00 NaN 0 0.0 NaN 0.00 NaN 0 0.00 NaN 0.0 NaN 0 0.0 NaN 0.00 NaN 0 0 0 2 0 0 0 0 0 1 17 1 1 1 0 1 0 1 0 0 0 0 0
4015 0409-MMPGJY-ECA 1 1.0 1 60 5 January 12 2 5 1 27 5 0 79 4.369448 4 7.7 1.344343 0.295905 4.738657 1.555754 0 2 0 -1 -1 1 2 2 0 0 0 0 0 0 1 1 9 3 0 -1 -1 -1.0 -1 -1 1 2 1 15.0 0 1 0 1 0 0 0 0 0 0 9 6 1 0 1 1 1 2 0 19 5 4 2 2 0 11 4 1249.83 881.78 0 3 30 0 8.15 2.098018 271.55 5.604146 1 17.00 2.833213 555.60 6.320049 0 0.0 NaN 0.00 NaN 1 5.50 1.704748 155.0 5.043425 0 0.0 NaN 0.00 NaN 0 0 0 0 0 1 1 1 0 1 14 1 1 1 0 0 0 0 0 0 1 0 0
4407 9411-CNRRPX-2HW 3 2.0 0 56 5 July 17 4 1 0 16 5 0 219 5.389072 5 7.0 5.212200 1.651002 10.117800 2.314296 0 2 0 -1 -1 1 5 0 5 0 0 0 0 0 0 2 19 4 2 1 0 46.1 3 0 1 1 1 15.0 1 1 0 0 1 0 0 0 0 0 9 3 0 0 1 3 1 4 0 33 5 1 1 2 0 23 5 678.79 96.49 0 3 63 0 23.50 3.157000 1508.30 7.318738 0 0.00 NaN 0.00 NaN 1 38.2 3.642836 2324.95 7.751454 1 38.50 3.650658 2400.0 7.783224 1 23.7 3.165475 1406.55 7.248895 1 0 1 1 1 1 1 1 1 1 12 1 1 1 0 1 0 1 1 0 0 0 1
1044 1890-WDOXSL-5H1 2 3.0 0 20 2 May 10 1 6 1 3 2 0 19 2.944439 1 3.0 0.369930 -0.994441 0.200070 -1.609088 0 2 1 11 1 4 0 0 0 0 0 0 0 0 0 2 0 1 2 1 0 9.6 1 0 1 1 1 19.0 1 0 1 0 0 0 0 0 0 0 9 5 0 0 0 3 3 3 0 2 2 4 3 1 0 2 2 362.36 163.95 1 3 24 0 3.75 1.321756 69.50 4.241327 0 0.00 NaN 0.00 NaN 0 0.0 NaN 0.00 NaN 1 7.25 1.981001 175.0 5.164786 0 0.0 NaN 0.00 NaN 0 0 0 0 0 1 1 1 0 1 22 0 1 1 0 0 0 0 0 0 0 0 0
In [4]:
# Find column information in the dataframe.
custdata_df.columns
Out[4]:
Index(['custid', 'region', 'townsize', 'gender', 'age', 'agecat', 'birthmonth',
       'ed', 'edcat', 'jobcat',
       ...
       'owncd', 'ownpda', 'ownpc', 'ownipod', 'owngame', 'ownfax', 'news',
       'response_01', 'response_02', 'response_03'],
      dtype='object', length=130)

Creating Dependent Y column

In [5]:
#To create Y we need to sumup cardspent(first card spent amount) and card2spent(Second card spent amount)
custdata_df['totalspend'] = custdata_df['cardspent'] + custdata_df['card2spent']
In [6]:
custdata_df.head()
Out[6]:
custid region townsize gender age agecat birthmonth ed edcat jobcat union employ empcat retire income lninc inccat debtinc creddebt lncreddebt othdebt lnothdebt default jobsat marital spoused spousedcat reside pets pets_cats pets_dogs pets_birds pets_reptiles pets_small pets_saltfish pets_freshfish homeown hometype address addresscat cars carown cartype carvalue carcatvalue carbought carbuy commute commutecat commutetime commutecar commutemotorcycle commutecarpool commutebus commuterail commutepublic commutebike commutewalk commutenonmotor telecommute reason polview polparty polcontrib vote card cardtype cardbenefit cardfee cardtenure cardtenurecat card2 card2type card2benefit card2fee card2tenure card2tenurecat cardspent card2spent active bfast tenure churn longmon lnlongmon longten lnlongten tollfree tollmon lntollmon tollten lntollten equip equipmon lnequipmon equipten lnequipten callcard cardmon lncardmon cardten lncardten wireless wiremon lnwiremon wireten lnwireten multline voice pager internet callid callwait forward confer ebill owntv hourstv ownvcr owndvd owncd ownpda ownpc ownipod owngame ownfax news response_01 response_02 response_03 totalspend
0 3964-QJWTRG-NPN 1 2.0 1 20 2 September 15 3 1 1 0 1 0 31 3.433987 2 11.1 1.200909 0.183079 2.240091 0.806516 1 1 0 -1 -1 3 0 0 0 0 0 0 0 0 0 2 0 1 2 1 0 14.3 1 0 0 8 4 22.0 0 1 1 0 0 0 0 1 0 0 9 6 1 0 1 3 1 1 0 2 2 5 3 1 0 3 2 81.66 67.80 0 3 5 1 6.50 1.871802 34.40 3.538057 1 29.0 3.367296 161.05 5.081715 1 29.50 3.384390 126.1 4.837075 1 14.25 2.656757 60.0 4.094345 0 0.00 NaN 0.00 NaN 1 1 1 0 0 1 1 1 0 1 13 1 1 0 0 0 1 1 0 0 0 1 0 149.46
1 0648-AIPJSP-UVM 5 5.0 0 22 2 May 17 4 2 0 0 1 0 15 2.708050 1 18.6 1.222020 0.200505 1.567980 0.449788 1 1 0 -1 -1 2 6 0 0 0 0 0 0 6 1 3 2 1 2 1 1 6.8 1 0 0 1 1 29.0 1 0 0 1 0 0 1 0 1 1 9 4 1 0 0 2 4 1 0 4 2 4 1 3 0 4 2 42.60 34.94 1 1 39 0 8.90 2.186051 330.60 5.800909 0 0.0 NaN 0.00 NaN 1 54.85 4.004602 1975.0 7.588324 1 16.00 2.772589 610.0 6.413459 1 45.65 3.821004 1683.55 7.428660 1 1 1 4 1 0 1 0 1 1 18 1 1 1 1 1 1 1 1 1 0 0 0 77.54
2 5195-TLUDJE-HVO 3 4.0 1 67 6 June 14 2 2 0 16 5 0 35 3.555348 2 9.9 0.928620 -0.074056 2.536380 0.930738 0 4 1 13 2 3 3 2 1 0 0 0 0 0 1 1 30 5 3 1 1 18.8 1 0 1 4 3 24.0 1 0 1 1 1 0 0 0 0 0 2 5 1 0 0 2 1 4 0 35 5 4 1 3 0 25 5 184.22 175.75 0 3 65 0 28.40 3.346389 1858.35 7.527444 0 0.0 NaN 0.00 NaN 0 0.00 NaN 0.0 NaN 1 23.00 3.135494 1410.0 7.251345 0 0.00 NaN 0.00 NaN 1 0 0 0 0 0 0 0 0 1 21 1 1 1 0 0 0 0 0 1 0 0 0 359.97
3 4459-VLPQUH-3OL 4 3.0 0 23 2 May 16 3 2 0 0 1 0 20 2.995732 1 5.7 0.022800 -3.780995 1.117200 0.110826 1 2 1 18 4 5 0 0 0 0 0 0 0 0 1 3 3 2 3 1 1 8.7 1 0 1 1 1 38.0 1 0 0 0 0 0 0 0 0 0 9 3 0 0 0 2 1 4 0 5 2 3 2 4 0 5 2 340.99 18.42 1 1 36 0 6.00 1.791759 199.45 5.295564 0 0.0 NaN 0.00 NaN 0 0.00 NaN 0.0 NaN 1 21.00 3.044522 685.0 6.529419 0 0.00 NaN 0.00 NaN 1 0 0 2 0 0 0 0 1 1 26 1 1 1 0 1 1 1 0 1 1 0 0 359.41
4 8158-SMTQFB-CNO 2 2.0 0 26 3 July 16 3 2 0 1 1 0 23 3.135494 1 1.7 0.214659 -1.538705 0.176341 -1.735336 0 1 1 13 2 4 0 0 0 0 0 0 0 0 0 2 3 2 1 0 1 10.6 1 0 1 6 3 32.0 0 0 0 0 0 1 0 1 0 0 9 4 0 0 0 4 2 1 0 8 3 1 3 2 0 9 3 255.10 252.73 1 3 21 0 3.05 1.115142 74.10 4.305416 1 16.5 2.803360 387.70 5.960232 0 0.00 NaN 0.0 NaN 1 17.25 2.847812 360.0 5.886104 1 19.05 2.947067 410.80 6.018106 0 1 0 3 1 1 1 1 0 1 27 1 1 1 0 1 0 1 0 0 0 1 0 507.83
In [7]:
# Now Run pandas profiling to see the data audit reports

import pandas_profiling
pandas_profiling.ProfileReport(custdata_df)
Out[7]:

Overview

Dataset info

Number of variables 131
Number of observations 5000
Total Missing (%) 0.2%
Total size in memory 5.0 MiB
Average record size in memory 1.0 KiB

Variables types

Numeric 59
Categorical 1
Boolean 49
Date 0
Text (Unique) 1
Rejected 21
Unsupported 0

Warnings

Variables

active
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.466
0
2670
1
2330
Value Count Frequency (%)  
0 2670 53.4%
 
1 2330 46.6%
 

address
Numeric

Distinct count 57
Unique (%) 1.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 16.402
Minimum 0
Maximum 57
Zeros (%) 4.9%

Quantile statistics

Minimum 0
5-th percentile 1
Q1 6
Median 14
Q3 25
95-th percentile 40
Maximum 57
Range 57
Interquartile range 19

Descriptive statistics

Standard deviation 12.397
Coef of variation 0.75583
Kurtosis -0.22967
Mean 16.402
MAD 10.223
Skewness 0.70655
Sum 82012
Variance 153.7
Memory size 39.1 KiB
Value Count Frequency (%)  
0 245 4.9%
 
2 196 3.9%
 
4 195 3.9%
 
5 177 3.5%
 
3 172 3.4%
 
1 169 3.4%
 
8 169 3.4%
 
7 166 3.3%
 
12 166 3.3%
 
6 163 3.3%
 
Other values (47) 3182 63.6%
 

Minimum 5 values

Value Count Frequency (%)  
0 245 4.9%
 
1 169 3.4%
 
2 196 3.9%
 
3 172 3.4%
 
4 195 3.9%
 

Maximum 5 values

Value Count Frequency (%)  
52 7 0.1%
 
53 6 0.1%
 
54 1 0.0%
 
55 5 0.1%
 
57 3 0.1%
 

addresscat
Highly correlated

This variable is highly correlated with address and should be ignored for analysis

Correlation 0.92352

age
Numeric

Distinct count 62
Unique (%) 1.2%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 47.026
Minimum 18
Maximum 79
Zeros (%) 0.0%

Quantile statistics

Minimum 18
5-th percentile 20
Q1 31
Median 47
Q3 62
95-th percentile 76
Maximum 79
Range 61
Interquartile range 31

Descriptive statistics

Standard deviation 17.77
Coef of variation 0.37789
Kurtosis -1.187
Mean 47.026
MAD 15.403
Skewness 0.09076
Sum 235128
Variance 315.78
Memory size 39.1 KiB
Value Count Frequency (%)  
18 106 2.1%
 
35 102 2.0%
 
37 98 2.0%
 
24 97 1.9%
 
21 95 1.9%
 
63 95 1.9%
 
31 94 1.9%
 
57 93 1.9%
 
25 93 1.9%
 
36 92 1.8%
 
Other values (52) 4035 80.7%
 

Minimum 5 values

Value Count Frequency (%)  
18 106 2.1%
 
19 78 1.6%
 
20 80 1.6%
 
21 95 1.9%
 
22 82 1.6%
 

Maximum 5 values

Value Count Frequency (%)  
75 74 1.5%
 
76 58 1.2%
 
77 71 1.4%
 
78 70 1.4%
 
79 73 1.5%
 

agecat
Highly correlated

This variable is highly correlated with age and should be ignored for analysis

Correlation 0.96988

bfast
Numeric

Distinct count 3
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.0586
Minimum 1
Maximum 3
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 1
Median 2
Q3 3
95-th percentile 3
Maximum 3
Range 2
Interquartile range 2

Descriptive statistics

Standard deviation 0.82952
Coef of variation 0.40295
Kurtosis -1.5385
Mean 2.0586
MAD 0.70605
Skewness -0.10964
Sum 10293
Variance 0.6881
Memory size 39.1 KiB
Value Count Frequency (%)  
3 1875 37.5%
 
1 1582 31.6%
 
2 1543 30.9%
 

Minimum 5 values

Value Count Frequency (%)  
1 1582 31.6%
 
2 1543 30.9%
 
3 1875 37.5%
 

Maximum 5 values

Value Count Frequency (%)  
1 1582 31.6%
 
2 1543 30.9%
 
3 1875 37.5%
 

birthmonth
Categorical

Distinct count 12
Unique (%) 0.2%
Missing (%) 0.0%
Missing (n) 0
September
 
458
May
 
451
January
 
420
Other values (9)
3671
Value Count Frequency (%)  
September 458 9.2%
 
May 451 9.0%
 
January 420 8.4%
 
June 420 8.4%
 
February 418 8.4%
 
March 416 8.3%
 
July 413 8.3%
 
October 410 8.2%
 
August 406 8.1%
 
November 399 8.0%
 
Other values (2) 789 15.8%
 

callcard
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.7162
1
3581
0
1419
Value Count Frequency (%)  
1 3581 71.6%
 
0 1419 28.4%
 

callid
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.4752
0
2624
1
2376
Value Count Frequency (%)  
0 2624 52.5%
 
1 2376 47.5%
 

callwait
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.479
0
2605
1
2395
Value Count Frequency (%)  
0 2605 52.1%
 
1 2395 47.9%
 

carbought
Numeric

Distinct count 3
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 0.221
Minimum -1
Maximum 1
Zeros (%) 58.0%

Quantile statistics

Minimum -1
5-th percentile -1
Q1 0
Median 0
Q3 1
95-th percentile 1
Maximum 1
Range 2
Interquartile range 1

Descriptive statistics

Standard deviation 0.60912
Coef of variation 2.7562
Kurtosis -0.5264
Mean 0.221
MAD 0.49918
Skewness -0.15823
Sum 1105
Variance 0.37103
Memory size 39.1 KiB
Value Count Frequency (%)  
0 2901 58.0%
 
1 1602 32.0%
 
-1 497 9.9%
 

Minimum 5 values

Value Count Frequency (%)  
-1 497 9.9%
 
0 2901 58.0%
 
1 1602 32.0%
 

Maximum 5 values

Value Count Frequency (%)  
-1 497 9.9%
 
0 2901 58.0%
 
1 1602 32.0%
 

carbuy
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.361
0
3195
1
1805
Value Count Frequency (%)  
0 3195 63.9%
 
1 1805 36.1%
 

carcatvalue
Numeric

Distinct count 4
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 1.3894
Minimum -1
Maximum 3
Zeros (%) 0.0%

Quantile statistics

Minimum -1
5-th percentile -1
Q1 1
Median 1
Q3 2
95-th percentile 3
Maximum 3
Range 4
Interquartile range 1

Descriptive statistics

Standard deviation 1.0813
Coef of variation 0.77825
Kurtosis 0.23064
Mean 1.3894
MAD 0.84868
Skewness -0.49643
Sum 6947
Variance 1.1692
Memory size 39.1 KiB
Value Count Frequency (%)  
1 2399 48.0%
 
2 1267 25.3%
 
3 837 16.7%
 
-1 497 9.9%
 

Minimum 5 values

Value Count Frequency (%)  
-1 497 9.9%
 
1 2399 48.0%
 
2 1267 25.3%
 
3 837 16.7%
 

Maximum 5 values

Value Count Frequency (%)  
-1 497 9.9%
 
1 2399 48.0%
 
2 1267 25.3%
 
3 837 16.7%
 

card
Numeric

Distinct count 5
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.7142
Minimum 1
Maximum 5
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 2
Median 3
Q3 4
95-th percentile 4
Maximum 5
Range 4
Interquartile range 2

Descriptive statistics

Standard deviation 1.1849
Coef of variation 0.43656
Kurtosis -1.1112
Mean 2.7142
MAD 1.0323
Skewness 0.015333
Sum 13571
Variance 1.404
Memory size 39.1 KiB
Value Count Frequency (%)  
4 1344 26.9%
 
2 1247 24.9%
 
3 1200 24.0%
 
1 986 19.7%
 
5 223 4.5%
 

Minimum 5 values

Value Count Frequency (%)  
1 986 19.7%
 
2 1247 24.9%
 
3 1200 24.0%
 
4 1344 26.9%
 
5 223 4.5%
 

Maximum 5 values

Value Count Frequency (%)  
1 986 19.7%
 
2 1247 24.9%
 
3 1200 24.0%
 
4 1344 26.9%
 
5 223 4.5%
 

card2
Numeric

Distinct count 5
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.7744
Minimum 1
Maximum 5
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 2
Median 3
Q3 4
95-th percentile 5
Maximum 5
Range 4
Interquartile range 2

Descriptive statistics

Standard deviation 1.1734
Coef of variation 0.42296
Kurtosis -0.91791
Mean 2.7744
MAD 0.99139
Skewness 0.084736
Sum 13872
Variance 1.377
Memory size 39.1 KiB
Value Count Frequency (%)  
3 1384 27.7%
 
2 1301 26.0%
 
4 1141 22.8%
 
1 829 16.6%
 
5 345 6.9%
 

Minimum 5 values

Value Count Frequency (%)  
1 829 16.6%
 
2 1301 26.0%
 
3 1384 27.7%
 
4 1141 22.8%
 
5 345 6.9%
 

Maximum 5 values

Value Count Frequency (%)  
1 829 16.6%
 
2 1301 26.0%
 
3 1384 27.7%
 
4 1141 22.8%
 
5 345 6.9%
 

card2benefit
Numeric

Distinct count 4
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.534
Minimum 1
Maximum 4
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 2
Median 3
Q3 4
95-th percentile 4
Maximum 4
Range 3
Interquartile range 2

Descriptive statistics

Standard deviation 1.1173
Coef of variation 0.44091
Kurtosis -1.3562
Mean 2.534
MAD 0.99851
Skewness -0.046519
Sum 12670
Variance 1.2483
Memory size 39.1 KiB
Value Count Frequency (%)  
4 1294 25.9%
 
3 1286 25.7%
 
2 1216 24.3%
 
1 1204 24.1%
 

Minimum 5 values

Value Count Frequency (%)  
1 1204 24.1%
 
2 1216 24.3%
 
3 1286 25.7%
 
4 1294 25.9%
 

Maximum 5 values

Value Count Frequency (%)  
1 1204 24.1%
 
2 1216 24.3%
 
3 1286 25.7%
 
4 1294 25.9%
 

card2fee
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.1872
0
4064
1
936
Value Count Frequency (%)  
0 4064 81.3%
 
1 936 18.7%
 

card2spent
Numeric

Distinct count 4477
Unique (%) 89.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 160.88
Minimum 0
Maximum 2069.2
Zeros (%) 3.6%

Quantile statistics

Minimum 0
5-th percentile 14.819
Q1 66.968
Median 125.34
Q3 208.31
95-th percentile 419.45
Maximum 2069.2
Range 2069.2
Interquartile range 141.34

Descriptive statistics

Standard deviation 146.29
Coef of variation 0.90935
Kurtosis 15.736
Mean 160.88
MAD 100.44
Skewness 2.8012
Sum 804380
Variance 21402
Memory size 39.1 KiB
Value Count Frequency (%)  
0.0 179 3.6%
 
63.690000000000005 3 0.1%
 
92.92 3 0.1%
 
175.75 3 0.1%
 
97.87 3 0.1%
 
112.88 3 0.1%
 
128.54 3 0.1%
 
159.1 3 0.1%
 
38.410000000000004 3 0.1%
 
128.35 3 0.1%
 
Other values (4467) 4794 95.9%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 179 3.6%
 
6.1000000000000005 1 0.0%
 
6.54 1 0.0%
 
6.86 1 0.0%
 
7.140000000000001 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
1277.68 1 0.0%
 
1282.76 1 0.0%
 
1309.3700000000001 1 0.0%
 
1611.3500000000001 1 0.0%
 
2069.25 1 0.0%
 

card2tenure
Highly correlated

This variable is highly correlated with cardtenure and should be ignored for analysis

Correlation 0.96298

card2tenurecat
Highly correlated

This variable is highly correlated with card2tenure and should be ignored for analysis

Correlation 0.92439

card2type
Numeric

Distinct count 4
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.5412
Minimum 1
Maximum 4
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 2
Median 3
Q3 4
95-th percentile 4
Maximum 4
Range 3
Interquartile range 2

Descriptive statistics

Standard deviation 1.1188
Coef of variation 0.44027
Kurtosis -1.3601
Mean 2.5412
MAD 1.0003
Skewness -0.04748
Sum 12706
Variance 1.2518
Memory size 39.1 KiB
Value Count Frequency (%)  
4 1319 26.4%
 
3 1257 25.1%
 
2 1235 24.7%
 
1 1189 23.8%
 

Minimum 5 values

Value Count Frequency (%)  
1 1189 23.8%
 
2 1235 24.7%
 
3 1257 25.1%
 
4 1319 26.4%
 

Maximum 5 values

Value Count Frequency (%)  
1 1189 23.8%
 
2 1235 24.7%
 
3 1257 25.1%
 
4 1319 26.4%
 

cardbenefit
Numeric

Distinct count 4
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.5058
Minimum 1
Maximum 4
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 2
Median 3
Q3 3.25
95-th percentile 4
Maximum 4
Range 3
Interquartile range 1.25

Descriptive statistics

Standard deviation 1.1172
Coef of variation 0.44586
Kurtosis -1.3579
Mean 2.5058
MAD 0.99894
Skewness -0.012388
Sum 12529
Variance 1.2482
Memory size 39.1 KiB
Value Count Frequency (%)  
3 1274 25.5%
 
4 1250 25.0%
 
1 1245 24.9%
 
2 1231 24.6%
 

Minimum 5 values

Value Count Frequency (%)  
1 1245 24.9%
 
2 1231 24.6%
 
3 1274 25.5%
 
4 1250 25.0%
 

Maximum 5 values

Value Count Frequency (%)  
1 1245 24.9%
 
2 1231 24.6%
 
3 1274 25.5%
 
4 1250 25.0%
 

cardfee
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.1898
0
4051
1
949
Value Count Frequency (%)  
0 4051 81.0%
 
1 949 19.0%
 

cardmon
Numeric

Distinct count 271
Unique (%) 5.4%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 15.444
Minimum 0
Maximum 188.5
Zeros (%) 28.4%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 13.75
Q3 22.75
95-th percentile 42
Maximum 188.5
Range 188.5
Interquartile range 22.75

Descriptive statistics

Standard deviation 15.008
Coef of variation 0.97175
Kurtosis 7.1671
Mean 15.444
MAD 11.245
Skewness 1.6877
Sum 77219
Variance 225.23
Memory size 39.1 KiB
Value Count Frequency (%)  
0.0 1419 28.4%
 
13.25 53 1.1%
 
11.5 52 1.0%
 
16.5 49 1.0%
 
16.25 49 1.0%
 
13.75 47 0.9%
 
18.25 45 0.9%
 
13.5 45 0.9%
 
14.25 44 0.9%
 
15.0 44 0.9%
 
Other values (261) 3153 63.1%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 1419 28.4%
 
3.25 1 0.0%
 
3.75 1 0.0%
 
4.0 3 0.1%
 
4.25 9 0.2%
 

Maximum 5 values

Value Count Frequency (%)  
100.25 1 0.0%
 
102.0 1 0.0%
 
104.5 1 0.0%
 
138.25 1 0.0%
 
188.5 1 0.0%
 

cardspent
Numeric

Distinct count 4760
Unique (%) 95.2%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 337.2
Minimum 0
Maximum 3926.4
Zeros (%) 0.1%

Quantile statistics

Minimum 0
5-th percentile 91.305
Q1 183.38
Median 276.36
Q3 418.54
95-th percentile 782.32
Maximum 3926.4
Range 3926.4
Interquartile range 235.16

Descriptive statistics

Standard deviation 245.15
Coef of variation 0.727
Kurtosis 21.44
Mean 337.2
MAD 167.79
Skewness 3.0512
Sum 1686000
Variance 60096
Memory size 39.1 KiB
Value Count Frequency (%)  
0.0 7 0.1%
 
186.91 4 0.1%
 
245.84 3 0.1%
 
321.19 3 0.1%
 
231.14000000000001 3 0.1%
 
202.31 3 0.1%
 
237.16 3 0.1%
 
412.99 3 0.1%
 
122.54 3 0.1%
 
249.0 3 0.1%
 
Other values (4750) 4965 99.3%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 7 0.1%
 
6.97 1 0.0%
 
7.34 1 0.0%
 
7.53 1 0.0%
 
8.11 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
2461.03 1 0.0%
 
2503.25 1 0.0%
 
2969.39 1 0.0%
 
3104.63 1 0.0%
 
3926.41 1 0.0%
 

cardten
Numeric

Distinct count 698
Unique (%) 14.0%
Missing (%) 0.0%
Missing (n) 2
Infinite (%) 0.0%
Infinite (n) 0
Mean 720.48
Minimum 0
Maximum 13705
Zeros (%) 28.4%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 425
Q3 1080
95-th percentile 2455.7
Maximum 13705
Range 13705
Interquartile range 1080

Descriptive statistics

Standard deviation 922.23
Coef of variation 1.28
Kurtosis 15.163
Mean 720.48
MAD 667.37
Skewness 2.6459
Sum 3601000
Variance 850500
Memory size 39.1 KiB
Value Count Frequency (%)  
0.0 1420 28.4%
 
590.0 21 0.4%
 
200.0 20 0.4%
 
380.0 20 0.4%
 
45.0 19 0.4%
 
195.0 19 0.4%
 
500.0 19 0.4%
 
330.0 18 0.4%
 
220.0 18 0.4%
 
435.0 18 0.4%
 
Other values (687) 3406 68.1%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 1420 28.4%
 
4.75 1 0.0%
 
5.0 17 0.3%
 
5.25 1 0.0%
 
7.75 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
6440.0 1 0.0%
 
7115.0 1 0.0%
 
7310.0 1 0.0%
 
9920.0 1 0.0%
 
13705.0 1 0.0%
 

cardtenure
Numeric

Distinct count 41
Unique (%) 0.8%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 16.656
Minimum 0
Maximum 40
Zeros (%) 1.8%

Quantile statistics

Minimum 0
5-th percentile 1
Q1 6
Median 14
Q3 26
95-th percentile 38
Maximum 40
Range 40
Interquartile range 20

Descriptive statistics

Standard deviation 12.021
Coef of variation 0.72173
Kurtosis -1.0561
Mean 16.656
MAD 10.355
Skewness 0.42936
Sum 83279
Variance 144.5
Memory size 39.1 KiB
Value Count Frequency (%)  
3 246 4.9%
 
1 228 4.6%
 
2 220 4.4%
 
4 193 3.9%
 
5 188 3.8%
 
6 176 3.5%
 
7 163 3.3%
 
11 158 3.2%
 
8 158 3.2%
 
9 153 3.1%
 
Other values (31) 3117 62.3%
 

Minimum 5 values

Value Count Frequency (%)  
0 91 1.8%
 
1 228 4.6%
 
2 220 4.4%
 
3 246 4.9%
 
4 193 3.9%
 

Maximum 5 values

Value Count Frequency (%)  
36 72 1.4%
 
37 83 1.7%
 
38 98 2.0%
 
39 113 2.3%
 
40 126 2.5%
 

cardtenurecat
Numeric

Distinct count 5
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 3.7822
Minimum 1
Maximum 5
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 3
Median 4
Q3 5
95-th percentile 5
Maximum 5
Range 4
Interquartile range 2

Descriptive statistics

Standard deviation 1.3538
Coef of variation 0.35794
Kurtosis -1.0266
Mean 3.7822
MAD 1.2057
Skewness -0.62824
Sum 18911
Variance 1.8327
Memory size 39.1 KiB
Value Count Frequency (%)  
5 2351 47.0%
 
2 847 16.9%
 
3 789 15.8%
 
4 694 13.9%
 
1 319 6.4%
 

Minimum 5 values

Value Count Frequency (%)  
1 319 6.4%
 
2 847 16.9%
 
3 789 15.8%
 
4 694 13.9%
 
5 2351 47.0%
 

Maximum 5 values

Value Count Frequency (%)  
1 319 6.4%
 
2 847 16.9%
 
3 789 15.8%
 
4 694 13.9%
 
5 2351 47.0%
 

cardtype
Numeric

Distinct count 4
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.507
Minimum 1
Maximum 4
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 2
Median 3
Q3 4
95-th percentile 4
Maximum 4
Range 3
Interquartile range 2

Descriptive statistics

Standard deviation 1.1185
Coef of variation 0.44614
Kurtosis -1.3608
Mean 2.507
MAD 1.0004
Skewness -0.0098086
Sum 12535
Variance 1.251
Memory size 39.1 KiB
Value Count Frequency (%)  
4 1260 25.2%
 
3 1257 25.1%
 
1 1242 24.8%
 
2 1241 24.8%
 

Minimum 5 values

Value Count Frequency (%)  
1 1242 24.8%
 
2 1241 24.8%
 
3 1257 25.1%
 
4 1260 25.2%
 

Maximum 5 values

Value Count Frequency (%)  
1 1242 24.8%
 
2 1241 24.8%
 
3 1257 25.1%
 
4 1260 25.2%
 

carown
Numeric

Distinct count 3
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 0.6414
Minimum -1
Maximum 1
Zeros (%) 16.0%

Quantile statistics

Minimum -1
5-th percentile -1
Q1 0
Median 1
Q3 1
95-th percentile 1
Maximum 1
Range 2
Interquartile range 1

Descriptive statistics

Standard deviation 0.6549
Coef of variation 1.021
Kurtosis 1.14
Mean 0.6414
MAD 0.5313
Skewness -1.5944
Sum 3207
Variance 0.42889
Memory size 39.1 KiB
Value Count Frequency (%)  
1 3704 74.1%
 
0 799 16.0%
 
-1 497 9.9%
 

Minimum 5 values

Value Count Frequency (%)  
-1 497 9.9%
 
0 799 16.0%
 
1 3704 74.1%
 

Maximum 5 values

Value Count Frequency (%)  
-1 497 9.9%
 
0 799 16.0%
 
1 3704 74.1%
 

cars
Numeric

Distinct count 9
Unique (%) 0.2%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.1306
Minimum 0
Maximum 8
Zeros (%) 9.9%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 1
Median 2
Q3 3
95-th percentile 4
Maximum 8
Range 8
Interquartile range 2

Descriptive statistics

Standard deviation 1.3075
Coef of variation 0.61366
Kurtosis 0.32839
Mean 2.1306
MAD 1.0136
Skewness 0.50172
Sum 10653
Variance 1.7095
Memory size 39.1 KiB
Value Count Frequency (%)  
2 1607 32.1%
 
1 1119 22.4%
 
3 1082 21.6%
 
0 497 9.9%
 
4 481 9.6%
 
5 149 3.0%
 
6 51 1.0%
 
7 13 0.3%
 
8 1 0.0%
 

Minimum 5 values

Value Count Frequency (%)  
0 497 9.9%
 
1 1119 22.4%
 
2 1607 32.1%
 
3 1082 21.6%
 
4 481 9.6%
 

Maximum 5 values

Value Count Frequency (%)  
4 481 9.6%
 
5 149 3.0%
 
6 51 1.0%
 
7 13 0.3%
 
8 1 0.0%
 

cartype
Numeric

Distinct count 3
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 0.3438
Minimum -1
Maximum 1
Zeros (%) 45.7%

Quantile statistics

Minimum -1
5-th percentile -1
Q1 0
Median 0
Q3 1
95-th percentile 1
Maximum 1
Range 2
Interquartile range 1

Descriptive statistics

Standard deviation 0.65153
Coef of variation 1.8951
Kurtosis -0.70821
Mean 0.3438
MAD 0.58166
Skewness -0.48685
Sum 1719
Variance 0.42449
Memory size 39.1 KiB
Value Count Frequency (%)  
0 2287 45.7%
 
1 2216 44.3%
 
-1 497 9.9%
 

Minimum 5 values

Value Count Frequency (%)  
-1 497 9.9%
 
0 2287 45.7%
 
1 2216 44.3%
 

Maximum 5 values

Value Count Frequency (%)  
-1 497 9.9%
 
0 2287 45.7%
 
1 2216 44.3%
 

carvalue
Numeric

Distinct count 801
Unique (%) 16.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 23.233
Minimum -1
Maximum 99.6
Zeros (%) 0.0%

Quantile statistics

Minimum -1
5-th percentile -1
Q1 9.2
Median 17
Q3 31.1
95-th percentile 72
Maximum 99.6
Range 100.6
Interquartile range 21.9

Descriptive statistics

Standard deviation 21.232
Coef of variation 0.91387
Kurtosis 1.9517
Mean 23.233
MAD 15.904
Skewness 1.474
Sum 116160
Variance 450.78
Memory size 39.1 KiB
Value Count Frequency (%)  
-1.0 497 9.9%
 
9.8 25 0.5%
 
13.5 24 0.5%
 
6.300000000000001 24 0.5%
 
10.200000000000001 23 0.5%
 
13.0 23 0.5%
 
11.4 22 0.4%
 
9.1 22 0.4%
 
9.200000000000001 22 0.4%
 
9.9 22 0.4%
 
Other values (791) 4296 85.9%
 

Minimum 5 values

Value Count Frequency (%)  
-1.0 497 9.9%
 
2.2 1 0.0%
 
2.3000000000000003 1 0.0%
 
2.4000000000000004 1 0.0%
 
2.5 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
98.2 1 0.0%
 
98.5 4 0.1%
 
98.80000000000001 1 0.0%
 
99.2 1 0.0%
 
99.60000000000001 1 0.0%
 

churn
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.2532
0
3734
1
1266
Value Count Frequency (%)  
0 3734 74.7%
 
1 1266 25.3%
 

commute
Numeric

Distinct count 10
Unique (%) 0.2%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.9962
Minimum 1
Maximum 10
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 1
Median 1
Q3 4
95-th percentile 8
Maximum 10
Range 9
Interquartile range 3

Descriptive statistics

Standard deviation 2.7435
Coef of variation 0.91567
Kurtosis -0.045572
Mean 2.9962
MAD 2.2996
Skewness 1.1277
Sum 14981
Variance 7.5269
Memory size 39.1 KiB
Value Count Frequency (%)  
1 2855 57.1%
 
4 635 12.7%
 
8 585 11.7%
 
5 302 6.0%
 
3 295 5.9%
 
10 153 3.1%
 
7 56 1.1%
 
2 50 1.0%
 
6 44 0.9%
 
9 25 0.5%
 

Minimum 5 values

Value Count Frequency (%)  
1 2855 57.1%
 
2 50 1.0%
 
3 295 5.9%
 
4 635 12.7%
 
5 302 6.0%
 

Maximum 5 values

Value Count Frequency (%)  
6 44 0.9%
 
7 56 1.1%
 
8 585 11.7%
 
9 25 0.5%
 
10 153 3.1%
 

commutebike
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.1234
0
4383
1
 
617
Value Count Frequency (%)  
0 4383 87.7%
 
1 617 12.3%
 

commutebus
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.406
0
2970
1
2030
Value Count Frequency (%)  
0 2970 59.4%
 
1 2030 40.6%
 

commutecar
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.679
1
3395
0
1605
Value Count Frequency (%)  
1 3395 67.9%
 
0 1605 32.1%
 

commutecarpool
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.2718
0
3641
1
1359
Value Count Frequency (%)  
0 3641 72.8%
 
1 1359 27.2%
 

commutecat
Highly correlated

This variable is highly correlated with commute and should be ignored for analysis

Correlation 0.98117

commutemotorcycle
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.1026
0
4487
1
 
513
Value Count Frequency (%)  
0 4487 89.7%
 
1 513 10.3%
 

commutenonmotor
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.0584
0
4708
1
 
292
Value Count Frequency (%)  
0 4708 94.2%
 
1 292 5.8%
 

commutepublic
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.0954
0
4523
1
 
477
Value Count Frequency (%)  
0 4523 90.5%
 
1 477 9.5%
 

commuterail
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.2746
0
3627
1
1373
Value Count Frequency (%)  
0 3627 72.5%
 
1 1373 27.5%
 

commutetime
Numeric

Distinct count 42
Unique (%) 0.8%
Missing (%) 0.0%
Missing (n) 2
Infinite (%) 0.0%
Infinite (n) 0
Mean 25.346
Minimum 8
Maximum 48
Zeros (%) 0.0%

Quantile statistics

Minimum 8
5-th percentile 16
Q1 21
Median 25
Q3 29
95-th percentile 35
Maximum 48
Range 40
Interquartile range 8

Descriptive statistics

Standard deviation 5.8791
Coef of variation 0.23196
Kurtosis 0.13487
Mean 25.346
MAD 4.6895
Skewness 0.29028
Sum 126680
Variance 34.564
Memory size 39.1 KiB
Value Count Frequency (%)  
24.0 336 6.7%
 
23.0 335 6.7%
 
27.0 331 6.6%
 
25.0 330 6.6%
 
22.0 325 6.5%
 
26.0 311 6.2%
 
21.0 307 6.1%
 
28.0 293 5.9%
 
29.0 260 5.2%
 
30.0 226 4.5%
 
Other values (31) 1944 38.9%
 

Minimum 5 values

Value Count Frequency (%)  
8.0 1 0.0%
 
9.0 6 0.1%
 
10.0 4 0.1%
 
11.0 9 0.2%
 
12.0 22 0.4%
 

Maximum 5 values

Value Count Frequency (%)  
44.0 4 0.1%
 
45.0 4 0.1%
 
46.0 6 0.1%
 
47.0 1 0.0%
 
48.0 1 0.0%
 

commutewalk
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.3838
0
3081
1
1919
Value Count Frequency (%)  
0 3081 61.6%
 
1 1919 38.4%
 

confer
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.478
0
2610
1
2390
Value Count Frequency (%)  
0 2610 52.2%
 
1 2390 47.8%
 

creddebt
Numeric

Distinct count 4950
Unique (%) 99.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 1.8573
Minimum 0
Maximum 109.07
Zeros (%) 0.0%

Quantile statistics

Minimum 0
5-th percentile 0.10109
Q1 0.38552
Median 0.92644
Q3 2.0638
95-th percentile 6.373
Maximum 109.07
Range 109.07
Interquartile range 1.6783

Descriptive statistics

Standard deviation 3.4157
Coef of variation 1.8391
Kurtosis 248.53
Mean 1.8573
MAD 1.669
Skewness 11.12
Sum 9286.6
Variance 11.667
Memory size 39.1 KiB
Value Count Frequency (%)  
0.272 2 0.0%
 
0.23587200000000003 2 0.0%
 
0.17171999999999998 2 0.0%
 
0.31600799999999996 2 0.0%
 
1.6744 2 0.0%
 
1.254 2 0.0%
 
0.658368 2 0.0%
 
0.129778 2 0.0%
 
0.4984199999999999 2 0.0%
 
0.66528 2 0.0%
 
Other values (4940) 4980 99.6%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 1 0.0%
 
0.001364 1 0.0%
 
0.00341 1 0.0%
 
0.00494 1 0.0%
 
0.006344 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
42.0985 1 0.0%
 
44.245560000000005 1 0.0%
 
48.704524 1 0.0%
 
67.49085 1 0.0%
 
109.072596 1 0.0%
 

custid
Categorical, Unique

First 3 values
0394-AVUMJX-JAH
0191-VKRHCM-922
9844-XTDOZB-DSM
Last 3 values
6015-ASUOWY-VXJ
7186-WTDJGD-F2K
6289-YVKMBB-CXK

First 10 values

Value Count Frequency (%)  
0002-GTOKLU-YVY 1 0.0%
 
0003-RLTRGE-IW2 1 0.0%
 
0003-UTGKPR-PRU 1 0.0%
 
0008-ZIQQOT-SGB 1 0.0%
 
0012-CIVYLF-839 1 0.0%
 

Last 10 values

Value Count Frequency (%)  
9991-FCIBKT-W29 1 0.0%
 
9992-FSFJPL-5D6 1 0.0%
 
9997-QIXKNU-54A 1 0.0%
 
9998-ZGHXLK-EOT 1 0.0%
 
9999-EGLHVE-19G 1 0.0%
 

debtinc
Numeric

Distinct count 325
Unique (%) 6.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 9.9542
Minimum 0
Maximum 43.1
Zeros (%) 0.0%

Quantile statistics

Minimum 0
5-th percentile 1.9
Q1 5.1
Median 8.8
Q3 13.6
95-th percentile 22.2
Maximum 43.1
Range 43.1
Interquartile range 8.5

Descriptive statistics

Standard deviation 6.3998
Coef of variation 0.64293
Kurtosis 1.3765
Mean 9.9542
MAD 5.0211
Skewness 1.0619
Sum 49771
Variance 40.957
Memory size 39.1 KiB
Value Count Frequency (%)  
7.000000000000001 48 1.0%
 
6.9 46 0.9%
 
4.1000000000000005 46 0.9%
 
5.4 45 0.9%
 
4.3999999999999995 42 0.8%
 
6.6000000000000005 42 0.8%
 
7.3 41 0.8%
 
5.8999999999999995 39 0.8%
 
11.3 39 0.8%
 
5.5 38 0.8%
 
Other values (315) 4574 91.5%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 1 0.0%
 
0.1 6 0.1%
 
0.2 2 0.0%
 
0.3 5 0.1%
 
0.4 6 0.1%
 

Maximum 5 values

Value Count Frequency (%)  
38.2 1 0.0%
 
40.699999999999996 1 0.0%
 
41.0 1 0.0%
 
41.699999999999996 1 0.0%
 
43.1 1 0.0%
 

default
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.2342
0
3829
1
1171
Value Count Frequency (%)  
0 3829 76.6%
 
1 1171 23.4%
 

ebill
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.3486
0
3257
1
1743
Value Count Frequency (%)  
0 3257 65.1%
 
1 1743 34.9%
 

ed
Numeric

Distinct count 18
Unique (%) 0.4%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 14.543
Minimum 6
Maximum 23
Zeros (%) 0.0%

Quantile statistics

Minimum 6
5-th percentile 9
Q1 12
Median 14
Q3 17
95-th percentile 20
Maximum 23
Range 17
Interquartile range 5

Descriptive statistics

Standard deviation 3.2811
Coef of variation 0.22561
Kurtosis -0.60706
Mean 14.543
MAD 2.7074
Skewness 0.0037335
Sum 72715
Variance 10.766
Memory size 39.1 KiB
Value Count Frequency (%)  
14 569 11.4%
 
15 536 10.7%
 
13 531 10.6%
 
16 486 9.7%
 
12 467 9.3%
 
17 454 9.1%
 
11 362 7.2%
 
18 349 7.0%
 
19 308 6.2%
 
10 260 5.2%
 
Other values (8) 678 13.6%
 

Minimum 5 values

Value Count Frequency (%)  
6 8 0.2%
 
7 31 0.6%
 
8 107 2.1%
 
9 178 3.6%
 
10 260 5.2%
 

Maximum 5 values

Value Count Frequency (%)  
19 308 6.2%
 
20 206 4.1%
 
21 111 2.2%
 
22 33 0.7%
 
23 4 0.1%
 

edcat
Highly correlated

This variable is highly correlated with ed and should be ignored for analysis

Correlation 0.9639

empcat
Numeric

Distinct count 5
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.9326
Minimum 1
Maximum 5
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 2
Median 3
Q3 4
95-th percentile 5
Maximum 5
Range 4
Interquartile range 2

Descriptive statistics

Standard deviation 1.4533
Coef of variation 0.49557
Kurtosis -1.3354
Mean 2.9326
MAD 1.2503
Skewness 0.15135
Sum 14663
Variance 2.1121
Memory size 39.1 KiB
Value Count Frequency (%)  
2 1180 23.6%
 
5 1135 22.7%
 
1 1048 21.0%
 
3 968 19.4%
 
4 669 13.4%
 

Minimum 5 values

Value Count Frequency (%)  
1 1048 21.0%
 
2 1180 23.6%
 
3 968 19.4%
 
4 669 13.4%
 
5 1135 22.7%
 

Maximum 5 values

Value Count Frequency (%)  
1 1048 21.0%
 
2 1180 23.6%
 
3 968 19.4%
 
4 669 13.4%
 
5 1135 22.7%
 

employ
Numeric

Distinct count 52
Unique (%) 1.0%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 9.7304
Minimum 0
Maximum 52
Zeros (%) 13.2%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 2
Median 7
Q3 15
95-th percentile 31
Maximum 52
Range 52
Interquartile range 13

Descriptive statistics

Standard deviation 9.6909
Coef of variation 0.99594
Kurtosis 1.0529
Mean 9.7304
MAD 7.6646
Skewness 1.2519
Sum 48652
Variance 93.914
Memory size 39.1 KiB
Value Count Frequency (%)  
0 659 13.2%
 
1 389 7.8%
 
2 318 6.4%
 
3 309 6.2%
 
4 293 5.9%
 
5 260 5.2%
 
6 250 5.0%
 
7 191 3.8%
 
8 187 3.7%
 
11 184 3.7%
 
Other values (42) 1960 39.2%
 

Minimum 5 values

Value Count Frequency (%)  
0 659 13.2%
 
1 389 7.8%
 
2 318 6.4%
 
3 309 6.2%
 
4 293 5.9%
 

Maximum 5 values

Value Count Frequency (%)  
47 1 0.0%
 
48 1 0.0%
 
49 1 0.0%
 
51 1 0.0%
 
52 1 0.0%
 

equip
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.3408
0
3296
1
1704
Value Count Frequency (%)  
0 3296 65.9%
 
1 1704 34.1%
 

equipmon
Highly correlated

This variable is highly correlated with equip and should be ignored for analysis

Correlation 0.94051

equipten
Numeric

Distinct count 1683
Unique (%) 33.7%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 470.18
Minimum 0
Maximum 6525.3
Zeros (%) 65.9%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 0
Q3 510.16
95-th percentile 2601
Maximum 6525.3
Range 6525.3
Interquartile range 510.16

Descriptive statistics

Standard deviation 912.22
Coef of variation 1.9402
Kurtosis 4.7863
Mean 470.18
MAD 664.55
Skewness 2.2149
Sum 2350900
Variance 832150
Memory size 39.1 KiB
Value Count Frequency (%)  
0.0 3296 65.9%
 
1259.35 2 0.0%
 
446.45 2 0.0%
 
1918.8 2 0.0%
 
824.3 2 0.0%
 
2778.3 2 0.0%
 
723.2 2 0.0%
 
163.4 2 0.0%
 
206.7 2 0.0%
 
224.7 2 0.0%
 
Other values (1673) 1686 33.7%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 3296 65.9%
 
12.05 1 0.0%
 
14.65 1 0.0%
 
14.85 1 0.0%
 
16.1 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
5174.45 1 0.0%
 
5996.85 1 0.0%
 
6014.7 1 0.0%
 
6158.95 1 0.0%
 
6525.3 1 0.0%
 

forward
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.4806
0
2597
1
2403
Value Count Frequency (%)  
0 2597 51.9%
 
1 2403 48.1%
 

gender
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.5036
1
2518
0
2482
Value Count Frequency (%)  
1 2518 50.4%
 
0 2482 49.6%
 

homeown
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.6296
1
3148
0
1852
Value Count Frequency (%)  
1 3148 63.0%
 
0 1852 37.0%
 

hometype
Numeric

Distinct count 4
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 1.8426
Minimum 1
Maximum 4
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 1
Median 2
Q3 2
95-th percentile 4
Maximum 4
Range 3
Interquartile range 1

Descriptive statistics

Standard deviation 0.91673
Coef of variation 0.49752
Kurtosis -0.43415
Mean 1.8426
MAD 0.7634
Skewness 0.76947
Sum 9213
Variance 0.84039
Memory size 39.1 KiB
Value Count Frequency (%)  
1 2265 45.3%
 
2 1548 31.0%
 
3 896 17.9%
 
4 291 5.8%
 

Minimum 5 values

Value Count Frequency (%)  
1 2265 45.3%
 
2 1548 31.0%
 
3 896 17.9%
 
4 291 5.8%
 

Maximum 5 values

Value Count Frequency (%)  
1 2265 45.3%
 
2 1548 31.0%
 
3 896 17.9%
 
4 291 5.8%
 

hourstv
Numeric

Distinct count 32
Unique (%) 0.6%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 19.645
Minimum 0
Maximum 36
Zeros (%) 1.7%

Quantile statistics

Minimum 0
5-th percentile 12
Q1 17
Median 20
Q3 23
95-th percentile 28
Maximum 36
Range 36
Interquartile range 6

Descriptive statistics

Standard deviation 5.1656
Coef of variation 0.26295
Kurtosis 2.3484
Mean 19.645
MAD 3.8622
Skewness -0.64471
Sum 98225
Variance 26.684
Memory size 39.1 KiB
Value Count Frequency (%)  
20 451 9.0%
 
19 445 8.9%
 
21 440 8.8%
 
18 413 8.3%
 
22 371 7.4%
 
17 350 7.0%
 
16 309 6.2%
 
23 301 6.0%
 
15 263 5.3%
 
24 248 5.0%
 
Other values (22) 1409 28.2%
 

Minimum 5 values

Value Count Frequency (%)  
0 85 1.7%
 
6 1 0.0%
 
7 3 0.1%
 
8 9 0.2%
 
9 13 0.3%
 

Maximum 5 values

Value Count Frequency (%)  
32 10 0.2%
 
33 8 0.2%
 
34 9 0.2%
 
35 6 0.1%
 
36 3 0.1%
 

inccat
Highly correlated

This variable is highly correlated with lninc and should be ignored for analysis

Correlation 0.94879

income
Numeric

Distinct count 266
Unique (%) 5.3%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 54.76
Minimum 9
Maximum 1073
Zeros (%) 0.0%

Quantile statistics

Minimum 9
5-th percentile 13
Q1 24
Median 38
Q3 67
95-th percentile 147
Maximum 1073
Range 1064
Interquartile range 43

Descriptive statistics

Standard deviation 55.378
Coef of variation 1.0113
Kurtosis 57.077
Mean 54.76
MAD 34.063
Skewness 5.1792
Sum 273798
Variance 3066.7
Memory size 39.1 KiB
Value Count Frequency (%)  
22 112 2.2%
 
29 109 2.2%
 
25 108 2.2%
 
20 102 2.0%
 
30 102 2.0%
 
18 100 2.0%
 
23 100 2.0%
 
24 99 2.0%
 
32 93 1.9%
 
16 91 1.8%
 
Other values (256) 3984 79.7%
 

Minimum 5 values

Value Count Frequency (%)  
9 83 1.7%
 
10 55 1.1%
 
11 57 1.1%
 
12 52 1.0%
 
13 56 1.1%
 

Maximum 5 values

Value Count Frequency (%)  
575 1 0.0%
 
642 1 0.0%
 
780 1 0.0%
 
995 1 0.0%
 
1073 1 0.0%
 

internet
Numeric

Distinct count 5
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 1.1996
Minimum 0
Maximum 4
Zeros (%) 50.0%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 1
Q3 2
95-th percentile 4
Maximum 4
Range 4
Interquartile range 2

Descriptive statistics

Standard deviation 1.4493
Coef of variation 1.2082
Kurtosis -0.83856
Mean 1.1996
MAD 1.2604
Skewness 0.80841
Sum 5998
Variance 2.1006
Memory size 39.1 KiB
Value Count Frequency (%)  
0 2498 50.0%
 
1 774 15.5%
 
3 598 12.0%
 
4 585 11.7%
 
2 545 10.9%
 

Minimum 5 values

Value Count Frequency (%)  
0 2498 50.0%
 
1 774 15.5%
 
2 545 10.9%
 
3 598 12.0%
 
4 585 11.7%
 

Maximum 5 values

Value Count Frequency (%)  
0 2498 50.0%
 
1 774 15.5%
 
2 545 10.9%
 
3 598 12.0%
 
4 585 11.7%
 

jobcat
Numeric

Distinct count 6
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.7528
Minimum 1
Maximum 6
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 1
Median 2
Q3 4
95-th percentile 6
Maximum 6
Range 5
Interquartile range 3

Descriptive statistics

Standard deviation 1.7379
Coef of variation 0.63132
Kurtosis -0.75877
Mean 2.7528
MAD 1.467
Skewness 0.79807
Sum 13764
Variance 3.0203
Memory size 39.1 KiB
Value Count Frequency (%)  
2 1640 32.8%
 
1 1388 27.8%
 
6 688 13.8%
 
3 620 12.4%
 
5 452 9.0%
 
4 212 4.2%
 

Minimum 5 values

Value Count Frequency (%)  
1 1388 27.8%
 
2 1640 32.8%
 
3 620 12.4%
 
4 212 4.2%
 
5 452 9.0%
 

Maximum 5 values

Value Count Frequency (%)  
2 1640 32.8%
 
3 620 12.4%
 
4 212 4.2%
 
5 452 9.0%
 
6 688 13.8%
 

jobsat
Numeric

Distinct count 5
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.9642
Minimum 1
Maximum 5
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 2
Median 3
Q3 4
95-th percentile 5
Maximum 5
Range 4
Interquartile range 2

Descriptive statistics

Standard deviation 1.3795
Coef of variation 0.46537
Kurtosis -1.2367
Mean 2.9642
MAD 1.1637
Skewness 0.02675
Sum 14821
Variance 1.9029
Memory size 39.1 KiB
Value Count Frequency (%)  
3 1085 21.7%
 
2 1031 20.6%
 
4 1016 20.3%
 
1 975 19.5%
 
5 893 17.9%
 

Minimum 5 values

Value Count Frequency (%)  
1 975 19.5%
 
2 1031 20.6%
 
3 1085 21.7%
 
4 1016 20.3%
 
5 893 17.9%
 

Maximum 5 values

Value Count Frequency (%)  
1 975 19.5%
 
2 1031 20.6%
 
3 1085 21.7%
 
4 1016 20.3%
 
5 893 17.9%
 

lncardmon
Highly correlated

This variable is highly correlated with cardmon and should be ignored for analysis

Correlation 0.91687

lncardten
Numeric

Distinct count 697
Unique (%) 13.9%
Missing (%) 28.4%
Missing (n) 1422
Infinite (%) 0.0%
Infinite (n) 0
Mean 6.4263
Minimum 1.5581
Maximum 9.5255
Zeros (%) 0.0%

Quantile statistics

Minimum 1.5581
5-th percentile 4.0943
Q1 5.8579
Median 6.6399
Q3 7.2189
95-th percentile 7.9233
Maximum 9.5255
Range 7.9674
Interquartile range 1.361

Descriptive statistics

Standard deviation 1.172
Coef of variation 0.18238
Kurtosis 2.0266
Mean 6.4263
MAD 0.88427
Skewness -1.1714
Sum 22993
Variance 1.3737
Memory size 39.1 KiB
Value Count Frequency (%)  
6.380122536899765 21 0.4%
 
5.940171252720432 20 0.4%
 
5.298317366548036 20 0.4%
 
3.8066624897703196 19 0.4%
 
5.272999558563747 19 0.4%
 
6.214608098422191 19 0.4%
 
6.075346031088684 18 0.4%
 
5.68697535633982 18 0.4%
 
5.799092654460526 18 0.4%
 
5.393627546352362 18 0.4%
 
Other values (686) 3388 67.8%
 
(Missing) 1422 28.4%
 

Minimum 5 values

Value Count Frequency (%)  
1.55814461804655 1 0.0%
 
1.6094379124341003 17 0.3%
 
1.6582280766035324 1 0.0%
 
2.0476928433652555 1 0.0%
 
2.0794415416798357 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
8.770283819098399 1 0.0%
 
8.869960510523953 1 0.0%
 
8.896998552743824 1 0.0%
 
9.20230820027892 1 0.0%
 
9.525516008736886 1 0.0%
 

lncreddebt
Numeric

Distinct count 4942
Unique (%) 98.8%
Missing (%) 0.0%
Missing (n) 1
Infinite (%) 0.0%
Infinite (n) 0
Mean -0.13045
Minimum -6.5973
Maximum 4.692
Zeros (%) 0.0%

Quantile statistics

Minimum -6.5973
5-th percentile -2.2916
Q1 -0.95269
Median -0.076106
Q3 0.72467
95-th percentile 1.8523
Maximum 4.692
Range 11.289
Interquartile range 1.6774

Descriptive statistics

Standard deviation 1.2731
Coef of variation -9.7587
Kurtosis 0.49262
Mean -0.13045
MAD 0.99857
Skewness -0.29509
Sum -652.14
Variance 1.6207
Memory size 39.1 KiB
Value Count Frequency (%)  
0.22633844221072896 2 0.0%
 
-0.5028574541168138 2 0.0%
 
-0.981229333033066 2 0.0%
 
-1.7618900356257774 2 0.0%
 
-0.12284582999884598 2 0.0%
 
-0.9930637906800688 2 0.0%
 
-1.151987749259985 2 0.0%
 
0.01744691360372049 2 0.0%
 
-2.041929980602883 2 0.0%
 
-1.4444659939752587 2 0.0%
 
Other values (4931) 4979 99.6%
 

Minimum 5 values

Value Count Frequency (%)  
-6.597333719560867 1 0.0%
 
-5.681042987686712 1 0.0%
 
-5.310389947782306 1 0.0%
 
-5.06024579464959 1 0.0%
 
-4.944145552827423 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
3.74001211059991 1 0.0%
 
3.789755027436595 1 0.0%
 
3.8857719210512522 1 0.0%
 
4.211992033134444 1 0.0%
 
4.692013678885976 1 0.0%
 

lnequipmon
Highly correlated

This variable is highly correlated with equipmon and should be ignored for analysis

Correlation 0.97931

lnequipten
Highly correlated

This variable is highly correlated with lntollten and should be ignored for analysis

Correlation 0.96611

lninc
Numeric

Distinct count 266
Unique (%) 5.3%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 3.6999
Minimum 2.1972
Maximum 6.9782
Zeros (%) 0.0%

Quantile statistics

Minimum 2.1972
5-th percentile 2.5649
Q1 3.1781
Median 3.6376
Q3 4.2047
95-th percentile 4.9904
Maximum 6.9782
Range 4.781
Interquartile range 1.0266

Descriptive statistics

Standard deviation 0.74707
Coef of variation 0.20192
Kurtosis -0.023788
Mean 3.6999
MAD 0.60138
Skewness 0.35028
Sum 18500
Variance 0.55812
Memory size 39.1 KiB
Value Count Frequency (%)  
3.091042453358316 112 2.2%
 
3.367295829986474 109 2.2%
 
3.2188758248682006 108 2.2%
 
3.4011973816621555 102 2.0%
 
2.995732273553991 102 2.0%
 
3.1354942159291497 100 2.0%
 
2.8903717578961645 100 2.0%
 
3.1780538303479458 99 2.0%
 
3.4657359027997265 93 1.9%
 
3.044522437723423 91 1.8%
 
Other values (256) 3984 79.7%
 

Minimum 5 values

Value Count Frequency (%)  
2.1972245773362196 83 1.7%
 
2.302585092994046 55 1.1%
 
2.3978952727983707 57 1.1%
 
2.4849066497880004 52 1.0%
 
2.5649493574615367 56 1.1%
 

Maximum 5 values

Value Count Frequency (%)  
6.354370040797351 1 0.0%
 
6.464588303689961 1 0.0%
 
6.659293919683638 1 0.0%
 
6.902742737158593 1 0.0%
 
6.9782137426306985 1 0.0%
 

lnlongmon
Numeric

Distinct count 866
Unique (%) 17.3%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.2888
Minimum -0.10536
Maximum 5.1921
Zeros (%) 0.0%

Quantile statistics

Minimum -0.10536
5-th percentile 1.0647
Q1 1.7405
Median 2.2565
Q3 2.8064
95-th percentile 3.6043
Maximum 5.1921
Range 5.2975
Interquartile range 1.0659

Descriptive statistics

Standard deviation 0.77518
Coef of variation 0.33869
Kurtosis -0.08563
Mean 2.2888
MAD 0.62183
Skewness 0.17355
Sum 11444
Variance 0.6009
Memory size 39.1 KiB
Value Count Frequency (%)  
1.4350845252893227 31 0.6%
 
1.6094379124341003 29 0.6%
 
2.066862759472976 28 0.6%
 
1.7316555451583497 25 0.5%
 
2.0149030205422647 25 0.5%
 
1.55814461804655 24 0.5%
 
1.3737155789130306 24 0.5%
 
1.6389967146756448 24 0.5%
 
1.599387576580599 24 0.5%
 
1.5151272329628591 23 0.5%
 
Other values (856) 4743 94.9%
 

Minimum 5 values

Value Count Frequency (%)  
-0.10536051565782628 2 0.0%
 
-0.05129329438755058 1 0.0%
 
0.0 1 0.0%
 
0.04879016416943205 1 0.0%
 
0.09531017980432493 2 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
4.7340030578310595 1 0.0%
 
4.741011408899505 1 0.0%
 
4.867534450455582 1 0.0%
 
4.94306997460049 1 0.0%
 
5.192123170141633 1 0.0%
 

lnlongten
Highly correlated

This variable is highly correlated with lnlongmon and should be ignored for analysis

Correlation 0.92171

lnothdebt
Numeric

Distinct count 4973
Unique (%) 99.5%
Missing (%) 0.0%
Missing (n) 1
Infinite (%) 0.0%
Infinite (n) 0
Mean 0.69692
Minimum -4.0921
Maximum 4.952
Zeros (%) 0.0%

Quantile statistics

Minimum -4.0921
5-th percentile -1.2435
Q1 -0.018987
Median 0.74154
Q3 1.4621
95-th percentile 2.4696
Maximum 4.952
Range 9.0441
Interquartile range 1.481

Descriptive statistics

Standard deviation 1.1286
Coef of variation 1.6194
Kurtosis 0.30242
Mean 0.69692
MAD 0.89086
Skewness -0.23209
Sum 3483.9
Variance 1.2737
Memory size 39.1 KiB
Value Count Frequency (%)  
0.10690811750408048 3 0.1%
 
0.32082183403905973 2 0.0%
 
0.586545895465986 2 0.0%
 
-1.70683025844275 2 0.0%
 
0.19724957737819138 2 0.0%
 
1.0522751134597443 2 0.0%
 
-0.1026973982383425 2 0.0%
 
-0.8003093181643999 2 0.0%
 
0.6127394062576954 2 0.0%
 
-1.0346014067076095 2 0.0%
 
Other values (4962) 4978 99.6%
 

Minimum 5 values

Value Count Frequency (%)  
-4.092107067281908 1 0.0%
 
-3.880718155599734 1 0.0%
 
-3.8532825104055455 1 0.0%
 
-3.7984264988456053 1 0.0%
 
-3.3090740927863496 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
4.08389414634726 1 0.0%
 
4.092846640897909 1 0.0%
 
4.203615920555767 1 0.0%
 
4.627991820121045 1 0.0%
 
4.952010982824063 1 0.0%
 

lntollmon
Highly correlated

This variable is highly correlated with tollmon and should be ignored for analysis

Correlation 0.93783

lntollten
Highly correlated

This variable is highly correlated with lnlongten and should be ignored for analysis

Correlation 0.93139

lnwiremon
Highly correlated

This variable is highly correlated with wiremon and should be ignored for analysis

Correlation 0.95389

lnwireten
Highly correlated

This variable is highly correlated with lnequipten and should be ignored for analysis

Correlation 0.98318

longmon
Numeric

Distinct count 866
Unique (%) 17.3%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 13.471
Minimum 0.9
Maximum 179.85
Zeros (%) 0.0%

Quantile statistics

Minimum 0.9
5-th percentile 2.9
Q1 5.7
Median 9.55
Q3 16.55
95-th percentile 36.758
Maximum 179.85
Range 178.95
Interquartile range 10.85

Descriptive statistics

Standard deviation 12.773
Coef of variation 0.94818
Kurtosis 18.503
Mean 13.471
MAD 8.4049
Skewness 3.2719
Sum 67357
Variance 163.16
Memory size 39.1 KiB
Value Count Frequency (%)  
4.2 31 0.6%
 
5.0 29 0.6%
 
7.9 28 0.6%
 
5.65 25 0.5%
 
7.5 25 0.5%
 
5.15 24 0.5%
 
4.75 24 0.5%
 
3.95 24 0.5%
 
4.95 24 0.5%
 
4.3 23 0.5%
 
Other values (856) 4743 94.9%
 

Minimum 5 values

Value Count Frequency (%)  
0.9 2 0.0%
 
0.95 1 0.0%
 
1.0 1 0.0%
 
1.05 1 0.0%
 
1.1 2 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
113.75 1 0.0%
 
114.55 1 0.0%
 
130.0 1 0.0%
 
140.2 1 0.0%
 
179.85 1 0.0%
 

longten
Highly correlated

This variable is highly correlated with longmon and should be ignored for analysis

Correlation 0.9857

marital
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.4802
0
2599
1
2401
Value Count Frequency (%)  
0 2599 52.0%
 
1 2401 48.0%
 

multline
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.4884
0
2558
1
2442
Value Count Frequency (%)  
0 2558 51.2%
 
1 2442 48.8%
 

news
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.4726
0
2637
1
2363
Value Count Frequency (%)  
0 2637 52.7%
 
1 2363 47.3%
 

othdebt
Numeric

Distinct count 4973
Unique (%) 99.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 3.6545
Minimum 0
Maximum 141.46
Zeros (%) 0.0%

Quantile statistics

Minimum 0
5-th percentile 0.28769
Q1 0.9803
Median 2.0985
Q3 4.3148
95-th percentile 11.816
Maximum 141.46
Range 141.46
Interquartile range 3.3345

Descriptive statistics

Standard deviation 5.3952
Coef of variation 1.4763
Kurtosis 125.15
Mean 3.6545
MAD 2.972
Skewness 7.5899
Sum 18272
Variance 29.108
Memory size 39.1 KiB
Value Count Frequency (%)  
1.112832 3 0.1%
 
4.6926000000000005 2 0.0%
 
4.45536 2 0.0%
 
1.218048 2 0.0%
 
1.131624 2 0.0%
 
0.18144 2 0.0%
 
0.531696 2 0.0%
 
2.86416 2 0.0%
 
0.355368 2 0.0%
 
2.2800960000000003 2 0.0%
 
Other values (4963) 4979 99.6%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 1 0.0%
 
0.016704 1 0.0%
 
0.020636 1 0.0%
 
0.02121000000000001 1 0.0%
 
0.022406 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
59.376239999999996 1 0.0%
 
59.910192 1 0.0%
 
66.9279 1 0.0%
 
102.308404 1 0.0%
 
141.45915 1 0.0%
 

owncd
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.9328
1
4664
0
 
336
Value Count Frequency (%)  
1 4664 93.3%
 
0 336 6.7%
 

owndvd
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.9136
1
4568
0
 
432
Value Count Frequency (%)  
1 4568 91.4%
 
0 432 8.6%
 

ownfax
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.1788
0
4106
1
894
Value Count Frequency (%)  
0 4106 82.1%
 
1 894 17.9%
 

owngame
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.4748
0
2626
1
2374
Value Count Frequency (%)  
0 2626 52.5%
 
1 2374 47.5%
 

ownipod
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.4792
0
2604
1
2396
Value Count Frequency (%)  
0 2604 52.1%
 
1 2396 47.9%
 

ownpc
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.6328
1
3164
0
1836
Value Count Frequency (%)  
1 3164 63.3%
 
0 1836 36.7%
 

ownpda
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.201
0
3995
1
1005
Value Count Frequency (%)  
0 3995 79.9%
 
1 1005 20.1%
 

owntv
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.983
1
4915
0
 
85
Value Count Frequency (%)  
1 4915 98.3%
 
0 85 1.7%
 

ownvcr
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.9156
1
4578
0
 
422
Value Count Frequency (%)  
1 4578 91.6%
 
0 422 8.4%
 

pager
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.2436
0
3782
1
1218
Value Count Frequency (%)  
0 3782 75.6%
 
1 1218 24.4%
 

pets
Numeric

Distinct count 20
Unique (%) 0.4%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 3.0674
Minimum 0
Maximum 21
Zeros (%) 30.6%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 2
Q3 5
95-th percentile 10
Maximum 21
Range 21
Interquartile range 5

Descriptive statistics

Standard deviation 3.4145
Coef of variation 1.1132
Kurtosis 0.89599
Mean 3.0674
MAD 2.7814
Skewness 1.1938
Sum 15337
Variance 11.659
Memory size 39.1 KiB
Value Count Frequency (%)  
0 1529 30.6%
 
1 780 15.6%
 
2 586 11.7%
 
3 376 7.5%
 
5 298 6.0%
 
4 284 5.7%
 
6 256 5.1%
 
7 246 4.9%
 
8 178 3.6%
 
9 170 3.4%
 
Other values (10) 297 5.9%
 

Minimum 5 values

Value Count Frequency (%)  
0 1529 30.6%
 
1 780 15.6%
 
2 586 11.7%
 
3 376 7.5%
 
4 284 5.7%
 

Maximum 5 values

Value Count Frequency (%)  
15 10 0.2%
 
16 9 0.2%
 
18 1 0.0%
 
19 3 0.1%
 
21 1 0.0%
 

pets_birds
Numeric

Distinct count 6
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 0.1104
Minimum 0
Maximum 5
Zeros (%) 94.0%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 0
Q3 0
95-th percentile 1
Maximum 5
Range 5
Interquartile range 0

Descriptive statistics

Standard deviation 0.49423
Coef of variation 4.4767
Kurtosis 29.823
Mean 0.1104
MAD 0.20746
Skewness 5.2464
Sum 552
Variance 0.24426
Memory size 39.1 KiB
Value Count Frequency (%)  
0 4698 94.0%
 
1 144 2.9%
 
2 88 1.8%
 
3 49 1.0%
 
4 20 0.4%
 
5 1 0.0%
 

Minimum 5 values

Value Count Frequency (%)  
0 4698 94.0%
 
1 144 2.9%
 
2 88 1.8%
 
3 49 1.0%
 
4 20 0.4%
 

Maximum 5 values

Value Count Frequency (%)  
1 144 2.9%
 
2 88 1.8%
 
3 49 1.0%
 
4 20 0.4%
 
5 1 0.0%
 

pets_cats
Numeric

Distinct count 7
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 0.5004
Minimum 0
Maximum 6
Zeros (%) 68.3%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 0
Q3 1
95-th percentile 2
Maximum 6
Range 6
Interquartile range 1

Descriptive statistics

Standard deviation 0.86078
Coef of variation 1.7202
Kurtosis 3.5056
Mean 0.5004
MAD 0.68315
Skewness 1.8804
Sum 2502
Variance 0.74095
Memory size 39.1 KiB
Value Count Frequency (%)  
0 3413 68.3%
 
1 923 18.5%
 
2 463 9.3%
 
3 160 3.2%
 
4 34 0.7%
 
5 5 0.1%
 
6 2 0.0%
 

Minimum 5 values

Value Count Frequency (%)  
0 3413 68.3%
 
1 923 18.5%
 
2 463 9.3%
 
3 160 3.2%
 
4 34 0.7%
 

Maximum 5 values

Value Count Frequency (%)  
2 463 9.3%
 
3 160 3.2%
 
4 34 0.7%
 
5 5 0.1%
 
6 2 0.0%
 

pets_dogs
Numeric

Distinct count 7
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 0.3924
Minimum 0
Maximum 7
Zeros (%) 75.2%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 0
Q3 0
95-th percentile 2
Maximum 7
Range 7
Interquartile range 0

Descriptive statistics

Standard deviation 0.79608
Coef of variation 2.0288
Kurtosis 5.8784
Mean 0.3924
MAD 0.59048
Skewness 2.3301
Sum 1962
Variance 0.63375
Memory size 39.1 KiB
Value Count Frequency (%)  
0 3762 75.2%
 
1 720 14.4%
 
2 360 7.2%
 
3 117 2.3%
 
4 36 0.7%
 
5 4 0.1%
 
7 1 0.0%
 

Minimum 5 values

Value Count Frequency (%)  
0 3762 75.2%
 
1 720 14.4%
 
2 360 7.2%
 
3 117 2.3%
 
4 36 0.7%
 

Maximum 5 values

Value Count Frequency (%)  
2 360 7.2%
 
3 117 2.3%
 
4 36 0.7%
 
5 4 0.1%
 
7 1 0.0%
 

pets_freshfish
Numeric

Distinct count 17
Unique (%) 0.3%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 1.8474
Minimum 0
Maximum 16
Zeros (%) 69.2%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 0
Q3 4
95-th percentile 8
Maximum 16
Range 16
Interquartile range 4

Descriptive statistics

Standard deviation 3.0748
Coef of variation 1.6644
Kurtosis 1.0344
Mean 1.8474
MAD 2.564
Skewness 1.4552
Sum 9237
Variance 9.4544
Memory size 39.1 KiB
Value Count Frequency (%)  
0 3462 69.2%
 
5 261 5.2%
 
6 251 5.0%
 
7 229 4.6%
 
4 222 4.4%
 
8 134 2.7%
 
3 130 2.6%
 
9 110 2.2%
 
2 63 1.3%
 
10 54 1.1%
 
Other values (7) 84 1.7%
 

Minimum 5 values

Value Count Frequency (%)  
0 3462 69.2%
 
1 17 0.3%
 
2 63 1.3%
 
3 130 2.6%
 
4 222 4.4%
 

Maximum 5 values

Value Count Frequency (%)  
12 18 0.4%
 
13 4 0.1%
 
14 4 0.1%
 
15 5 0.1%
 
16 1 0.0%
 

pets_reptiles
Numeric

Distinct count 7
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 0.0556
Minimum 0
Maximum 6
Zeros (%) 96.4%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 0
Q3 0
95-th percentile 0
Maximum 6
Range 6
Interquartile range 0

Descriptive statistics

Standard deviation 0.32578
Coef of variation 5.8593
Kurtosis 72.829
Mean 0.0556
MAD 0.10715
Skewness 7.5861
Sum 278
Variance 0.10613
Memory size 39.1 KiB
Value Count Frequency (%)  
0 4818 96.4%
 
1 114 2.3%
 
2 46 0.9%
 
3 19 0.4%
 
6 1 0.0%
 
5 1 0.0%
 
4 1 0.0%
 

Minimum 5 values

Value Count Frequency (%)  
0 4818 96.4%
 
1 114 2.3%
 
2 46 0.9%
 
3 19 0.4%
 
4 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
2 46 0.9%
 
3 19 0.4%
 
4 1 0.0%
 
5 1 0.0%
 
6 1 0.0%
 

pets_saltfish
Numeric

Distinct count 9
Unique (%) 0.2%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 0.0466
Minimum 0
Maximum 8
Zeros (%) 98.8%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 0
Q3 0
95-th percentile 0
Maximum 8
Range 8
Interquartile range 0

Descriptive statistics

Standard deviation 0.46955
Coef of variation 10.076
Kurtosis 133.34
Mean 0.0466
MAD 0.092119
Skewness 11.194
Sum 233
Variance 0.22047
Memory size 39.1 KiB
Value Count Frequency (%)  
0 4942 98.8%
 
3 11 0.2%
 
6 11 0.2%
 
2 11 0.2%
 
4 10 0.2%
 
5 8 0.2%
 
7 3 0.1%
 
1 3 0.1%
 
8 1 0.0%
 

Minimum 5 values

Value Count Frequency (%)  
0 4942 98.8%
 
1 3 0.1%
 
2 11 0.2%
 
3 11 0.2%
 
4 10 0.2%
 

Maximum 5 values

Value Count Frequency (%)  
4 10 0.2%
 
5 8 0.2%
 
6 11 0.2%
 
7 3 0.1%
 
8 1 0.0%
 

pets_small
Numeric

Distinct count 8
Unique (%) 0.2%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 0.1146
Minimum 0
Maximum 7
Zeros (%) 95.0%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 0
Q3 0
95-th percentile 1
Maximum 7
Range 7
Interquartile range 0

Descriptive statistics

Standard deviation 0.5688
Coef of variation 4.9633
Kurtosis 39.154
Mean 0.1146
MAD 0.21769
Skewness 5.9087
Sum 573
Variance 0.32353
Memory size 39.1 KiB
Value Count Frequency (%)  
0 4749 95.0%
 
2 83 1.7%
 
1 78 1.6%
 
3 50 1.0%
 
4 26 0.5%
 
5 10 0.2%
 
6 3 0.1%
 
7 1 0.0%
 

Minimum 5 values

Value Count Frequency (%)  
0 4749 95.0%
 
1 78 1.6%
 
2 83 1.7%
 
3 50 1.0%
 
4 26 0.5%
 

Maximum 5 values

Value Count Frequency (%)  
3 50 1.0%
 
4 26 0.5%
 
5 10 0.2%
 
6 3 0.1%
 
7 1 0.0%
 

polcontrib
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.2384
0
3808
1
1192
Value Count Frequency (%)  
0 3808 76.2%
 
1 1192 23.8%
 

polparty
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.3814
0
3093
1
1907
Value Count Frequency (%)  
0 3093 61.9%
 
1 1907 38.1%
 

polview
Numeric

Distinct count 7
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 4.0886
Minimum 1
Maximum 7
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 2
Q1 3
Median 4
Q3 5
95-th percentile 6
Maximum 7
Range 6
Interquartile range 2

Descriptive statistics

Standard deviation 1.3871
Coef of variation 0.33925
Kurtosis -0.5312
Mean 4.0886
MAD 1.0702
Skewness -0.19834
Sum 20443
Variance 1.9239
Memory size 39.1 KiB
Value Count Frequency (%)  
4 1733 34.7%
 
5 893 17.9%
 
6 843 16.9%
 
3 659 13.2%
 
2 623 12.5%
 
1 163 3.3%
 
7 86 1.7%
 

Minimum 5 values

Value Count Frequency (%)  
1 163 3.3%
 
2 623 12.5%
 
3 659 13.2%
 
4 1733 34.7%
 
5 893 17.9%
 

Maximum 5 values

Value Count Frequency (%)  
3 659 13.2%
 
4 1733 34.7%
 
5 893 17.9%
 
6 843 16.9%
 
7 86 1.7%
 

reason
Numeric

Distinct count 5
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 7.6368
Minimum 1
Maximum 9
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 9
Median 9
Q3 9
95-th percentile 9
Maximum 9
Range 8
Interquartile range 0

Descriptive statistics

Standard deviation 2.85
Coef of variation 0.37319
Kurtosis 0.84805
Mean 7.6368
MAD 2.2095
Skewness -1.6586
Sum 38184
Variance 8.1225
Memory size 39.1 KiB
Value Count Frequency (%)  
9 4052 81.0%
 
1 447 8.9%
 
2 339 6.8%
 
4 105 2.1%
 
3 57 1.1%
 

Minimum 5 values

Value Count Frequency (%)  
1 447 8.9%
 
2 339 6.8%
 
3 57 1.1%
 
4 105 2.1%
 
9 4052 81.0%
 

Maximum 5 values

Value Count Frequency (%)  
1 447 8.9%
 
2 339 6.8%
 
3 57 1.1%
 
4 105 2.1%
 
9 4052 81.0%
 

region
Numeric

Distinct count 5
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 3.0014
Minimum 1
Maximum 5
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 2
Median 3
Q3 4
95-th percentile 5
Maximum 5
Range 4
Interquartile range 2

Descriptive statistics

Standard deviation 1.4218
Coef of variation 0.4737
Kurtosis -1.309
Mean 3.0014
MAD 1.2069
Skewness 0.0050525
Sum 15007
Variance 2.0214
Memory size 39.1 KiB
Value Count Frequency (%)  
5 1027 20.5%
 
1 1009 20.2%
 
3 1003 20.1%
 
2 995 19.9%
 
4 966 19.3%
 

Minimum 5 values

Value Count Frequency (%)  
1 1009 20.2%
 
2 995 19.9%
 
3 1003 20.1%
 
4 966 19.3%
 
5 1027 20.5%
 

Maximum 5 values

Value Count Frequency (%)  
1 1009 20.2%
 
2 995 19.9%
 
3 1003 20.1%
 
4 966 19.3%
 
5 1027 20.5%
 

reside
Numeric

Distinct count 9
Unique (%) 0.2%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.204
Minimum 1
Maximum 9
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 1
Median 2
Q3 3
95-th percentile 5
Maximum 9
Range 8
Interquartile range 2

Descriptive statistics

Standard deviation 1.394
Coef of variation 0.63248
Kurtosis 1.0075
Mean 2.204
MAD 1.0998
Skewness 1.2288
Sum 11020
Variance 1.9432
Memory size 39.1 KiB
Value Count Frequency (%)  
1 2035 40.7%
 
2 1467 29.3%
 
3 552 11.0%
 
4 521 10.4%
 
5 288 5.8%
 
6 99 2.0%
 
7 29 0.6%
 
8 7 0.1%
 
9 2 0.0%
 

Minimum 5 values

Value Count Frequency (%)  
1 2035 40.7%
 
2 1467 29.3%
 
3 552 11.0%
 
4 521 10.4%
 
5 288 5.8%
 

Maximum 5 values

Value Count Frequency (%)  
5 288 5.8%
 
6 99 2.0%
 
7 29 0.6%
 
8 7 0.1%
 
9 2 0.0%
 

response_01
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.0836
0
4582
1
 
418
Value Count Frequency (%)  
0 4582 91.6%
 
1 418 8.4%
 

response_02
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.1298
0
4351
1
 
649
Value Count Frequency (%)  
0 4351 87.0%
 
1 649 13.0%
 

response_03
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.1026
0
4487
1
 
513
Value Count Frequency (%)  
0 4487 89.7%
 
1 513 10.3%
 

retire
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.1476
0
4262
1
 
738
Value Count Frequency (%)  
0 4262 85.2%
 
1 738 14.8%
 

spoused
Highly correlated

This variable is highly correlated with marital and should be ignored for analysis

Correlation 0.95577

spousedcat
Highly correlated

This variable is highly correlated with spoused and should be ignored for analysis

Correlation 0.98403

telecommute
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.188
0
4060
1
940
Value Count Frequency (%)  
0 4060 81.2%
 
1 940 18.8%
 

tenure
Highly correlated

This variable is highly correlated with card2tenure and should be ignored for analysis

Correlation 0.92824

tollfree
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.4756
0
2622
1
2378
Value Count Frequency (%)  
0 2622 52.4%
 
1 2378 47.6%
 

tollmon
Numeric

Distinct count 235
Unique (%) 4.7%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 13.264
Minimum 0
Maximum 173
Zeros (%) 52.4%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 0
Q3 24.5
95-th percentile 43.5
Maximum 173
Range 173
Interquartile range 24.5

Descriptive statistics

Standard deviation 16.31
Coef of variation 1.2296
Kurtosis 2.3195
Mean 13.264
MAD 13.992
Skewness 1.1816
Sum 66322
Variance 266.02
Memory size 39.1 KiB
Value Count Frequency (%)  
0.0 2622 52.4%
 
22.75 33 0.7%
 
18.0 33 0.7%
 
24.0 32 0.6%
 
23.0 31 0.6%
 
22.0 30 0.6%
 
23.75 30 0.6%
 
20.0 29 0.6%
 
19.0 29 0.6%
 
19.5 29 0.6%
 
Other values (225) 2102 42.0%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 2622 52.4%
 
8.0 1 0.0%
 
8.5 2 0.0%
 
8.75 2 0.0%
 
9.0 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
86.0 1 0.0%
 
94.75 1 0.0%
 
99.25 1 0.0%
 
101.75 1 0.0%
 
173.0 1 0.0%
 

tollten
Numeric

Distinct count 2323
Unique (%) 46.5%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 577.83
Minimum 0
Maximum 6923.4
Zeros (%) 52.4%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 0
Q3 885.45
95-th percentile 2620.2
Maximum 6923.4
Range 6923.4
Interquartile range 885.45

Descriptive statistics

Standard deviation 949.15
Coef of variation 1.6426
Kurtosis 4.8793
Mean 577.83
MAD 712.02
Skewness 2.0899
Sum 2889200
Variance 900890
Memory size 39.1 KiB
Value Count Frequency (%)  
0.0 2622 52.4%
 
16.75 3 0.1%
 
1480.5 3 0.1%
 
10.0 3 0.1%
 
68.65 2 0.0%
 
423.2 2 0.0%
 
478.95 2 0.0%
 
17.75 2 0.0%
 
1325.8 2 0.0%
 
481.5 2 0.0%
 
Other values (2313) 2357 47.1%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 2622 52.4%
 
8.75 1 0.0%
 
10.0 3 0.1%
 
10.5 1 0.0%
 
10.75 1 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
5646.45 1 0.0%
 
6196.5 1 0.0%
 
6763.8 1 0.0%
 
6843.0 1 0.0%
 
6923.45 1 0.0%
 

totalspend
Highly correlated

This variable is highly correlated with cardspent and should be ignored for analysis

Correlation 0.94149

townsize
Numeric

Distinct count 6
Unique (%) 0.1%
Missing (%) 0.0%
Missing (n) 2
Infinite (%) 0.0%
Infinite (n) 0
Mean 2.6873
Minimum 1
Maximum 5
Zeros (%) 0.0%

Quantile statistics

Minimum 1
5-th percentile 1
Q1 1
Median 3
Q3 4
95-th percentile 5
Maximum 5
Range 4
Interquartile range 3

Descriptive statistics

Standard deviation 1.4259
Coef of variation 0.53062
Kurtosis -1.2628
Mean 2.6873
MAD 1.2578
Skewness 0.27662
Sum 13431
Variance 2.0333
Memory size 39.1 KiB
Value Count Frequency (%)  
1.0 1436 28.7%
 
2.0 1048 21.0%
 
3.0 907 18.1%
 
4.0 857 17.1%
 
5.0 750 15.0%
 
(Missing) 2 0.0%
 

Minimum 5 values

Value Count Frequency (%)  
1.0 1436 28.7%
 
2.0 1048 21.0%
 
3.0 907 18.1%
 
4.0 857 17.1%
 
5.0 750 15.0%
 

Maximum 5 values

Value Count Frequency (%)  
1.0 1436 28.7%
 
2.0 1048 21.0%
 
3.0 907 18.1%
 
4.0 857 17.1%
 
5.0 750 15.0%
 

union
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.1512
0
4244
1
 
756
Value Count Frequency (%)  
0 4244 84.9%
 
1 756 15.1%
 

voice
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.303
0
3485
1
1515
Value Count Frequency (%)  
0 3485 69.7%
 
1 1515 30.3%
 

vote
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.518
1
2590
0
2410
Value Count Frequency (%)  
1 2590 51.8%
 
0 2410 48.2%
 

wireless
Boolean

Distinct count 2
Unique (%) 0.0%
Missing (%) 0.0%
Missing (n) 0
Mean 0.2688
0
3656
1
1344
Value Count Frequency (%)  
0 3656 73.1%
 
1 1344 26.9%
 

wiremon
Numeric

Distinct count 746
Unique (%) 14.9%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 10.701
Minimum 0
Maximum 186.25
Zeros (%) 73.1%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 0
Q3 20.962
95-th percentile 51.305
Maximum 186.25
Range 186.25
Interquartile range 20.962

Descriptive statistics

Standard deviation 19.8
Coef of variation 1.8502
Kurtosis 4.7838
Mean 10.701
MAD 15.649
Skewness 1.998
Sum 53506
Variance 392.03
Memory size 39.1 KiB
Value Count Frequency (%)  
0.0 3656 73.1%
 
34.3 9 0.2%
 
22.95 7 0.1%
 
27.95 7 0.1%
 
32.2 7 0.1%
 
39.3 6 0.1%
 
22.5 6 0.1%
 
31.15 5 0.1%
 
43.2 5 0.1%
 
27.75 5 0.1%
 
Other values (736) 1287 25.7%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 3656 73.1%
 
12.7 1 0.0%
 
14.0 1 0.0%
 
14.55 1 0.0%
 
14.7 2 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
119.1 1 0.0%
 
130.75 1 0.0%
 
134.45 1 0.0%
 
165.15 1 0.0%
 
186.25 1 0.0%
 

wireten
Numeric

Distinct count 1328
Unique (%) 26.6%
Missing (%) 0.0%
Missing (n) 0
Infinite (%) 0.0%
Infinite (n) 0
Mean 421.98
Minimum 0
Maximum 12859
Zeros (%) 73.1%

Quantile statistics

Minimum 0
5-th percentile 0
Q1 0
Median 0
Q3 89.962
95-th percentile 2687.9
Maximum 12859
Range 12859
Interquartile range 89.962

Descriptive statistics

Standard deviation 1001
Coef of variation 2.3721
Kurtosis 15.464
Mean 421.98
MAD 646.82
Skewness 3.3042
Sum 2109900
Variance 1002000
Memory size 39.1 KiB
Value Count Frequency (%)  
0.0 3656 73.1%
 
2182.05 2 0.0%
 
2386.25 2 0.0%
 
2323.8 2 0.0%
 
1062.75 2 0.0%
 
2049.85 2 0.0%
 
1199.2 2 0.0%
 
183.1 2 0.0%
 
20.9 2 0.0%
 
1073.95 2 0.0%
 
Other values (1318) 1326 26.5%
 

Minimum 5 values

Value Count Frequency (%)  
0.0 3656 73.1%
 
12.7 1 0.0%
 
14.55 1 0.0%
 
14.6 1 0.0%
 
14.9 2 0.0%
 

Maximum 5 values

Value Count Frequency (%)  
7217.1 1 0.0%
 
8675.05 1 0.0%
 
9039.8 1 0.0%
 
9851.15 1 0.0%
 
12858.65 1 0.0%
 

Correlations

Sample

custid region townsize gender age agecat birthmonth ed edcat jobcat union employ empcat retire income lninc inccat debtinc creddebt lncreddebt othdebt lnothdebt default jobsat marital spoused spousedcat reside pets pets_cats pets_dogs pets_birds pets_reptiles pets_small pets_saltfish pets_freshfish homeown hometype address addresscat cars carown cartype carvalue carcatvalue carbought carbuy commute commutecat commutetime commutecar commutemotorcycle commutecarpool commutebus commuterail commutepublic commutebike commutewalk commutenonmotor telecommute reason polview polparty polcontrib vote card cardtype cardbenefit cardfee cardtenure cardtenurecat card2 card2type card2benefit card2fee card2tenure card2tenurecat cardspent card2spent active bfast tenure churn longmon lnlongmon longten lnlongten tollfree tollmon lntollmon tollten lntollten equip equipmon lnequipmon equipten lnequipten callcard cardmon lncardmon cardten lncardten wireless wiremon lnwiremon wireten lnwireten multline voice pager internet callid callwait forward confer ebill owntv hourstv ownvcr owndvd owncd ownpda ownpc ownipod owngame ownfax news response_01 response_02 response_03 totalspend
0 3964-QJWTRG-NPN 1 2.0 1 20 2 September 15 3 1 1 0 1 0 31 3.433987 2 11.1 1.200909 0.183079 2.240091 0.806516 1 1 0 -1 -1 3 0 0 0 0 0 0 0 0 0 2 0 1 2 1 0 14.3 1 0 0 8 4 22.0 0 1 1 0 0 0 0 1 0 0 9 6 1 0 1 3 1 1 0 2 2 5 3 1 0 3 2 81.66 67.80 0 3 5 1 6.50 1.871802 34.40 3.538057 1 29.0 3.367296 161.05 5.081715 1 29.50 3.384390 126.1 4.837075 1 14.25 2.656757 60.0 4.094345 0 0.00 NaN 0.00 NaN 1 1 1 0 0 1 1 1 0 1 13 1 1 0 0 0 1 1 0 0 0 1 0 149.46
1 0648-AIPJSP-UVM 5 5.0 0 22 2 May 17 4 2 0 0 1 0 15 2.708050 1 18.6 1.222020 0.200505 1.567980 0.449788 1 1 0 -1 -1 2 6 0 0 0 0 0 0 6 1 3 2 1 2 1 1 6.8 1 0 0 1 1 29.0 1 0 0 1 0 0 1 0 1 1 9 4 1 0 0 2 4 1 0 4 2 4 1 3 0 4 2 42.60 34.94 1 1 39 0 8.90 2.186051 330.60 5.800909 0 0.0 NaN 0.00 NaN 1 54.85 4.004602 1975.0 7.588324 1 16.00 2.772589 610.0 6.413459 1 45.65 3.821004 1683.55 7.428660 1 1 1 4 1 0 1 0 1 1 18 1 1 1 1 1 1 1 1 1 0 0 0 77.54
2 5195-TLUDJE-HVO 3 4.0 1 67 6 June 14 2 2 0 16 5 0 35 3.555348 2 9.9 0.928620 -0.074056 2.536380 0.930738 0 4 1 13 2 3 3 2 1 0 0 0 0 0 1 1 30 5 3 1 1 18.8 1 0 1 4 3 24.0 1 0 1 1 1 0 0 0 0 0 2 5 1 0 0 2 1 4 0 35 5 4 1 3 0 25 5 184.22 175.75 0 3 65 0 28.40 3.346389 1858.35 7.527444 0 0.0 NaN 0.00 NaN 0 0.00 NaN 0.0 NaN 1 23.00 3.135494 1410.0 7.251345 0 0.00 NaN 0.00 NaN 1 0 0 0 0 0 0 0 0 1 21 1 1 1 0 0 0 0 0 1 0 0 0 359.97
3 4459-VLPQUH-3OL 4 3.0 0 23 2 May 16 3 2 0 0 1 0 20 2.995732 1 5.7 0.022800 -3.780995 1.117200 0.110826 1 2 1 18 4 5 0 0 0 0 0 0 0 0 1 3 3 2 3 1 1 8.7 1 0 1 1 1 38.0 1 0 0 0 0 0 0 0 0 0 9 3 0 0 0 2 1 4 0 5 2 3 2 4 0 5 2 340.99 18.42 1 1 36 0 6.00 1.791759 199.45 5.295564 0 0.0 NaN 0.00 NaN 0 0.00 NaN 0.0 NaN 1 21.00 3.044522 685.0 6.529419 0 0.00 NaN 0.00 NaN 1 0 0 2 0 0 0 0 1 1 26 1 1 1 0 1 1 1 0 1 1 0 0 359.41
4 8158-SMTQFB-CNO 2 2.0 0 26 3 July 16 3 2 0 1 1 0 23 3.135494 1 1.7 0.214659 -1.538705 0.176341 -1.735336 0 1 1 13 2 4 0 0 0 0 0 0 0 0 0 2 3 2 1 0 1 10.6 1 0 1 6 3 32.0 0 0 0 0 0 1 0 1 0 0 9 4 0 0 0 4 2 1 0 8 3 1 3 2 0 9 3 255.10 252.73 1 3 21 0 3.05 1.115142 74.10 4.305416 1 16.5 2.803360 387.70 5.960232 0 0.00 NaN 0.0 NaN 1 17.25 2.847812 360.0 5.886104 1 19.05 2.947067 410.80 6.018106 0 1 0 3 1 1 1 1 0 1 27 1 1 1 0 1 0 1 0 0 0 1 0 507.83

Drop Variables

In [7]:
#Drop cardspent(first card spent amount) and card2spent(Second card spent amount) because its not adding any value
#Also  we are droping "custid","birthmonth" bacuse cust id is unique and birth month is not adding any value

custdata_df.drop(["cardspent","card2spent","custid"],axis=1, inplace=True)
In [ ]:
#Item count need to drop promary and secondary Iem count

Check linearlity of data:

  1. Use scatter plot by using 'df.plot' or if you want a linear line you can use 'sns.lmplot' through seaborn.
In [8]:
# Seaborn scatter plot with regression line
# aspect=1.5, scatter_kws={'alpha':0.2}) - You can choose this option too.
sns.lmplot(x='income', y='totalspend', data=custdata_df)
Out[8]:
<seaborn.axisgrid.FacetGrid at 0xba09c88>

So we can see above most data points are near to line which means our data is normal and we can go ahed for further process.

In [9]:
# Creating Categorical List= Contains categorical variables...

for x in ['region','townsize','gender','agecat','edcat','birthmonth','jobcat','union','employ','empcat','retire',
          'inccat','default','jobsat','marital','spousedcat','homeown','hometype','address','addresscat','cars','carown',
          'cartype','carcatvalue','carbought','carbuy','commute','commutecat','commutecar',
          'commutemotorcycle','commutecarpool','commutebus','commuterail','commutepublic','commutebike','commutewalk',
         'commutenonmotor','telecommute','reason','polview','polparty','polcontrib','vote','card','cardtype','cardbenefit',
         'cardfee','cardtenure','cardtenurecat','card2','card2type','card2benefit','card2fee','card2tenure','card2tenurecat',
         'active','bfast','churn','tollfree','equip','callcard','wireless','multline','voice','pager','internet','callid',
         'callwait','forward','confer','ebill','owntv','ownvcr','owndvd','owncd','ownpda','ownpc','ownipod','owngame','ownfax',
        'news','response_01','response_02','response_03']:
    custdata_df[x]=custdata_df[x].astype('object')
In [ ]:
 

Separate numerical and categorical variable

To do that you need to apply for loop along with if condition.

In [10]:
# Find numerical variable in Data frame.
# This will return a list
numeric_var_names = [key for key in dict(custdata_df.dtypes) if dict(custdata_df.dtypes)[key] in ['float64', 'int64', 'float32', 'int32']]

# Find Categorical variable in Data frame
cat_var_names = [key for key in dict(custdata_df.dtypes) if dict(custdata_df.dtypes)[key] in ['object']]

#Print the data frame
print( numeric_var_names)
print(cat_var_names)
['age', 'ed', 'income', 'lninc', 'debtinc', 'creddebt', 'lncreddebt', 'othdebt', 'lnothdebt', 'spoused', 'reside', 'pets', 'pets_cats', 'pets_dogs', 'pets_birds', 'pets_reptiles', 'pets_small', 'pets_saltfish', 'pets_freshfish', 'carvalue', 'commutetime', 'tenure', 'longmon', 'lnlongmon', 'longten', 'lnlongten', 'tollmon', 'lntollmon', 'tollten', 'lntollten', 'equipmon', 'lnequipmon', 'equipten', 'lnequipten', 'cardmon', 'lncardmon', 'cardten', 'lncardten', 'wiremon', 'lnwiremon', 'wireten', 'lnwireten', 'hourstv', 'totalspend']
['region', 'townsize', 'gender', 'agecat', 'birthmonth', 'edcat', 'jobcat', 'union', 'employ', 'empcat', 'retire', 'inccat', 'default', 'jobsat', 'marital', 'spousedcat', 'homeown', 'hometype', 'address', 'addresscat', 'cars', 'carown', 'cartype', 'carcatvalue', 'carbought', 'carbuy', 'commute', 'commutecat', 'commutecar', 'commutemotorcycle', 'commutecarpool', 'commutebus', 'commuterail', 'commutepublic', 'commutebike', 'commutewalk', 'commutenonmotor', 'telecommute', 'reason', 'polview', 'polparty', 'polcontrib', 'vote', 'card', 'cardtype', 'cardbenefit', 'cardfee', 'cardtenure', 'cardtenurecat', 'card2', 'card2type', 'card2benefit', 'card2fee', 'card2tenure', 'card2tenurecat', 'active', 'bfast', 'churn', 'tollfree', 'equip', 'callcard', 'wireless', 'multline', 'voice', 'pager', 'internet', 'callid', 'callwait', 'forward', 'confer', 'ebill', 'owntv', 'ownvcr', 'owndvd', 'owncd', 'ownpda', 'ownpc', 'ownipod', 'owngame', 'ownfax', 'news', 'response_01', 'response_02', 'response_03']
In [11]:
#Convert a list in to dataframe 
#Information about numericvariable in custdata_df

custdata_df_num=custdata_df[numeric_var_names]
custdata_df_num.head(5)
Out[11]:
age ed income lninc debtinc creddebt lncreddebt othdebt lnothdebt spoused reside pets pets_cats pets_dogs pets_birds pets_reptiles pets_small pets_saltfish pets_freshfish carvalue commutetime tenure longmon lnlongmon longten lnlongten tollmon lntollmon tollten lntollten equipmon lnequipmon equipten lnequipten cardmon lncardmon cardten lncardten wiremon lnwiremon wireten lnwireten hourstv totalspend
0 20 15 31 3.433987 11.1 1.200909 0.183079 2.240091 0.806516 -1 3 0 0 0 0 0 0 0 0 14.3 22.0 5 6.50 1.871802 34.40 3.538057 29.0 3.367296 161.05 5.081715 29.50 3.384390 126.1 4.837075 14.25 2.656757 60.0 4.094345 0.00 NaN 0.00 NaN 13 149.46
1 22 17 15 2.708050 18.6 1.222020 0.200505 1.567980 0.449788 -1 2 6 0 0 0 0 0 0 6 6.8 29.0 39 8.90 2.186051 330.60 5.800909 0.0 NaN 0.00 NaN 54.85 4.004602 1975.0 7.588324 16.00 2.772589 610.0 6.413459 45.65 3.821004 1683.55 7.428660 18 77.54
2 67 14 35 3.555348 9.9 0.928620 -0.074056 2.536380 0.930738 13 3 3 2 1 0 0 0 0 0 18.8 24.0 65 28.40 3.346389 1858.35 7.527444 0.0 NaN 0.00 NaN 0.00 NaN 0.0 NaN 23.00 3.135494 1410.0 7.251345 0.00 NaN 0.00 NaN 21 359.97
3 23 16 20 2.995732 5.7 0.022800 -3.780995 1.117200 0.110826 18 5 0 0 0 0 0 0 0 0 8.7 38.0 36 6.00 1.791759 199.45 5.295564 0.0 NaN 0.00 NaN 0.00 NaN 0.0 NaN 21.00 3.044522 685.0 6.529419 0.00 NaN 0.00 NaN 26 359.41
4 26 16 23 3.135494 1.7 0.214659 -1.538705 0.176341 -1.735336 13 4 0 0 0 0 0 0 0 0 10.6 32.0 21 3.05 1.115142 74.10 4.305416 16.5 2.803360 387.70 5.960232 0.00 NaN 0.0 NaN 17.25 2.847812 360.0 5.886104 19.05 2.947067 410.80 6.018106 27 507.83
In [12]:
#Convert a list in to dataframe 
#Information about categorical variable in custdata_df

custdata_df_cat=custdata_df[cat_var_names]
custdata_df_cat.head(5)
Out[12]:
region townsize gender agecat birthmonth edcat jobcat union employ empcat retire inccat default jobsat marital spousedcat homeown hometype address addresscat cars carown cartype carcatvalue carbought carbuy commute commutecat commutecar commutemotorcycle commutecarpool commutebus commuterail commutepublic commutebike commutewalk commutenonmotor telecommute reason polview polparty polcontrib vote card cardtype cardbenefit cardfee cardtenure cardtenurecat card2 card2type card2benefit card2fee card2tenure card2tenurecat active bfast churn tollfree equip callcard wireless multline voice pager internet callid callwait forward confer ebill owntv ownvcr owndvd owncd ownpda ownpc ownipod owngame ownfax news response_01 response_02 response_03
0 1 2 1 2 September 3 1 1 0 1 0 2 1 1 0 -1 0 2 0 1 2 1 0 1 0 0 8 4 0 1 1 0 0 0 0 1 0 0 9 6 1 0 1 3 1 1 0 2 2 5 3 1 0 3 2 0 3 1 1 1 1 0 1 1 1 0 0 1 1 1 0 1 1 1 0 0 0 1 1 0 0 0 1 0
1 5 5 0 2 May 4 2 0 0 1 0 1 1 1 0 -1 1 3 2 1 2 1 1 1 0 0 1 1 1 0 0 1 0 0 1 0 1 1 9 4 1 0 0 2 4 1 0 4 2 4 1 3 0 4 2 1 1 0 0 1 1 1 1 1 1 4 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0
2 3 4 1 6 June 2 2 0 16 5 0 2 0 4 1 2 1 1 30 5 3 1 1 1 0 1 4 3 1 0 1 1 1 0 0 0 0 0 2 5 1 0 0 2 1 4 0 35 5 4 1 3 0 25 5 0 3 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 1 0 0 0
3 4 3 0 2 May 3 2 0 0 1 0 1 1 2 1 4 1 3 3 2 3 1 1 1 0 1 1 1 1 0 0 0 0 0 0 0 0 0 9 3 0 0 0 2 1 4 0 5 2 3 2 4 0 5 2 1 1 0 0 0 1 0 1 0 0 2 0 0 0 0 1 1 1 1 1 0 1 1 1 0 1 1 0 0
4 2 2 0 3 July 3 2 0 1 1 0 1 0 1 1 2 0 2 3 2 1 0 1 1 0 1 6 3 0 0 0 0 0 1 0 1 0 0 9 4 0 0 0 4 2 1 0 8 3 1 3 2 0 9 3 1 3 0 1 0 1 1 0 1 0 3 1 1 1 1 0 1 1 1 1 0 1 0 1 0 0 0 1 0

Creating Data audit Report

In [13]:
# Use a general function that returns multiple values
def var_summary(x):
    return pd.Series([x.count(), x.isnull().sum(), x.sum(), x.mean(), x.median(),  x.std(), x.var(), x.min(), x.dropna().quantile(0.01), x.dropna().quantile(0.05),x.dropna().quantile(0.10),x.dropna().quantile(0.25),x.dropna().quantile(0.50),x.dropna().quantile(0.75), x.dropna().quantile(0.90),x.dropna().quantile(0.95), x.dropna().quantile(0.99),x.max()], 
                  index=['N', 'NMISS', 'SUM', 'MEAN','MEDIAN', 'STD', 'VAR', 'MIN', 'P1' , 'P5' ,'P10' ,'P25' ,'P50' ,'P75' ,'P90' ,'P95' ,'P99' ,'MAX'])

num_summary=custdata_df_num.apply(lambda x: var_summary(x)).T
In [14]:
num_summary
Out[14]:
N NMISS SUM MEAN MEDIAN STD VAR MIN P1 P5 P10 P25 P50 P75 P90 P95 P99 MAX
age 5000.0 0.0 2.351280e+05 47.025600 47.000000 17.770338 3.157849e+02 18.000000 18.000000 20.000000 23.000000 31.000000 47.000000 62.000000 72.000000 76.000000 79.000000 79.000000
ed 5000.0 0.0 7.271500e+04 14.543000 14.000000 3.281083 1.076550e+01 6.000000 8.000000 9.000000 10.000000 12.000000 14.000000 17.000000 19.000000 20.000000 21.000000 23.000000
income 5000.0 0.0 2.737980e+05 54.759600 38.000000 55.377511 3.066669e+03 9.000000 9.000000 13.000000 16.000000 24.000000 38.000000 67.000000 109.100000 147.000000 272.010000 1073.000000
lninc 5000.0 0.0 1.849955e+04 3.699909 3.637586 0.747072 5.581164e-01 2.197225 2.197225 2.564949 2.772589 3.178054 3.637586 4.204693 4.692261 4.990433 5.605839 6.978214
debtinc 5000.0 0.0 4.977080e+04 9.954160 8.800000 6.399783 4.095723e+01 0.000000 0.700000 1.900000 2.800000 5.100000 8.800000 13.600000 18.600000 22.200000 29.200000 43.100000
creddebt 5000.0 0.0 9.286628e+03 1.857326 0.926437 3.415732 1.166722e+01 0.000000 0.033160 0.101088 0.175682 0.385520 0.926437 2.063820 4.299470 6.373010 14.280358 109.072596
lncreddebt 4999.0 1.0 -6.521372e+02 -0.130454 -0.076106 1.273058 1.620678e+00 -6.597334 -3.401690 -2.291604 -1.737842 -0.952685 -0.076106 0.724665 1.458625 1.852297 2.658910 4.692014
othdebt 5000.0 0.0 1.827230e+04 3.654460 2.098540 5.395172 2.910788e+01 0.000000 0.114299 0.287692 0.457997 0.980301 2.098540 4.314780 8.062046 11.815981 24.064260 141.459150
lnothdebt 4999.0 1.0 3.483879e+03 0.696915 0.741537 1.128578 1.273689e+00 -4.092107 -2.168241 -1.243483 -0.780312 -0.018987 0.741537 1.462053 2.087178 2.469586 3.180802 4.952011
spoused 5000.0 0.0 3.056400e+04 6.112800 -1.000000 7.743518 5.996207e+01 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 14.000000 16.000000 18.000000 20.000000 24.000000
reside 5000.0 0.0 1.102000e+04 2.204000 2.000000 1.393977 1.943173e+00 1.000000 1.000000 1.000000 1.000000 1.000000 2.000000 3.000000 4.000000 5.000000 6.000000 9.000000
pets 5000.0 0.0 1.533700e+04 3.067400 2.000000 3.414497 1.165879e+01 0.000000 0.000000 0.000000 0.000000 0.000000 2.000000 5.000000 8.000000 10.000000 13.000000 21.000000
pets_cats 5000.0 0.0 2.502000e+03 0.500400 0.000000 0.860783 7.409480e-01 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 2.000000 2.000000 3.000000 6.000000
pets_dogs 5000.0 0.0 1.962000e+03 0.392400 0.000000 0.796084 6.337490e-01 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.000000 2.000000 3.000000 7.000000
pets_birds 5000.0 0.0 5.520000e+02 0.110400 0.000000 0.494227 2.442607e-01 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 3.000000 5.000000
pets_reptiles 5000.0 0.0 2.780000e+02 0.055600 0.000000 0.325776 1.061299e-01 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.000000 6.000000
pets_small 5000.0 0.0 5.730000e+02 0.114600 0.000000 0.568798 3.235315e-01 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 3.000000 7.000000
pets_saltfish 5000.0 0.0 2.330000e+02 0.046600 0.000000 0.469545 2.204725e-01 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.000000 8.000000
pets_freshfish 5000.0 0.0 9.237000e+03 1.847400 0.000000 3.074801 9.454404e+00 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 4.000000 7.000000 8.000000 11.000000 16.000000
carvalue 5000.0 0.0 1.161629e+05 23.232580 17.000000 21.231637 4.507824e+02 -1.000000 -1.000000 -1.000000 2.490000 9.200000 17.000000 31.100000 52.910000 72.000000 92.001000 99.600000
commutetime 4998.0 2.0 1.266770e+05 25.345538 25.000000 5.879149 3.456439e+01 8.000000 13.000000 16.000000 18.000000 21.000000 25.000000 29.000000 33.000000 35.000000 40.030000 48.000000
tenure 5000.0 0.0 1.910240e+05 38.204800 38.000000 22.661888 5.135612e+02 0.000000 1.000000 4.000000 7.000000 18.000000 38.000000 59.000000 69.000000 72.000000 72.000000 72.000000
longmon 5000.0 0.0 6.735725e+04 13.471450 9.550000 12.773381 1.631593e+02 0.900000 1.850000 2.900000 3.700000 5.700000 9.550000 16.550000 27.000000 36.757500 65.201000 179.850000
lnlongmon 5000.0 0.0 1.144390e+04 2.288779 2.256541 0.775178 6.009008e-01 -0.105361 0.615186 1.064711 1.308333 1.740466 2.256541 2.806386 3.295837 3.604342 4.177475 5.192123
longten 4997.0 3.0 3.542232e+06 708.871753 350.000000 979.291072 9.590110e+05 0.900000 2.400000 12.620000 28.290000 104.600000 350.000000 913.850000 1808.840000 2567.650000 4689.066000 13046.500000
lnlongten 4997.0 3.0 2.803966e+04 5.611298 5.857933 1.649308 2.720218e+00 -0.105361 0.875469 2.535272 3.342505 4.650144 5.857933 6.817666 7.500441 7.850745 8.452988 9.476275
tollmon 5000.0 0.0 6.632225e+04 13.264450 0.000000 16.310018 2.660167e+02 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 24.500000 35.500000 43.500000 58.752500 173.000000
lntollmon 2378.0 2622.0 7.712400e+03 3.243230 3.228826 0.404659 1.637486e-01 2.079442 2.345833 2.583998 2.740840 2.970414 3.228826 3.518980 3.789855 3.926912 4.190524 4.622519
tollten 5000.0 0.0 2.889163e+06 577.832510 0.000000 949.151586 9.008887e+05 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 885.450000 1918.955000 2620.212500 3977.270500 6923.450000
lntollten 2378.0 2622.0 1.565861e+04 6.584783 6.858013 1.222040 1.493382e+00 2.169054 2.784527 4.208490 4.821207 5.912218 6.858013 7.459900 7.882781 8.106642 8.429812 8.842669
equipmon 5000.0 0.0 6.495655e+04 12.991310 0.000000 19.212943 3.691372e+02 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 30.800000 42.700000 49.052500 63.300500 106.300000
lnequipmon 1704.0 3296.0 6.134805e+03 3.600238 3.598681 0.283385 8.030686e-02 2.833213 2.967847 3.139833 3.234749 3.412797 3.598681 3.790138 3.971235 4.065473 4.269466 4.666265
equipten 5000.0 0.0 2.350882e+06 470.176400 0.000000 912.220624 8.321465e+05 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 510.162500 1912.325000 2600.990000 3679.457500 6525.300000
lnequipten 1704.0 3296.0 1.149739e+04 6.747296 7.050556 1.199234 1.438161e+00 2.489065 3.169937 4.251230 5.033244 6.171570 7.050556 7.649835 7.977833 8.117631 8.369037 8.783442
cardmon 5000.0 0.0 7.721925e+04 15.443850 13.750000 15.007569 2.252271e+02 0.000000 0.000000 0.000000 0.000000 0.000000 13.750000 22.750000 34.000000 42.000000 64.250000 188.500000
lncardmon 3581.0 1419.0 1.041975e+04 2.909733 2.904165 0.564859 3.190658e-01 1.178655 1.658228 1.981001 2.169054 2.545531 2.904165 3.295837 3.637586 3.839452 4.239162 5.239098
cardten 4998.0 2.0 3.600951e+06 720.478391 425.000000 922.225527 8.504999e+05 0.000000 0.000000 0.000000 0.000000 0.000000 425.000000 1080.000000 1871.500000 2455.750000 4011.200000 13705.000000
lncardten 3578.0 1422.0 2.299333e+04 6.426309 6.639876 1.172050 1.373700e+00 1.558145 2.484907 4.094345 4.941642 5.857933 6.639876 7.218910 7.673223 7.923257 8.392151 9.525516
wiremon 5000.0 0.0 5.350595e+04 10.701190 0.000000 19.799837 3.920335e+02 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 20.962500 40.860000 51.305000 78.304000 186.250000
lnwiremon 1344.0 3656.0 4.845121e+03 3.605001 3.597997 0.390102 1.521793e-01 2.541602 2.806811 2.992964 3.118613 3.330417 3.597997 3.865193 4.102643 4.267282 4.577186 5.227090
wireten 5000.0 0.0 2.109923e+06 421.984610 0.000000 1001.003287 1.002008e+06 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 89.962500 1778.535000 2687.922500 4530.186000 12858.650000
lnwireten 1344.0 3656.0 9.150129e+03 6.808132 7.147185 1.283967 1.648571e+00 2.541602 3.039749 4.114134 4.881065 6.158091 7.147185 7.755376 8.106616 8.310817 8.690117 9.461772
hourstv 5000.0 0.0 9.822500e+04 19.645000 20.000000 5.165609 2.668351e+01 0.000000 0.000000 12.000000 14.000000 17.000000 20.000000 23.000000 26.000000 28.000000 31.000000 36.000000
totalspend 5000.0 0.0 2.490393e+06 498.078630 414.250000 351.529270 1.235728e+05 8.110000 58.197600 133.106000 184.033000 276.282500 414.250000 615.562500 908.125000 1145.146500 1760.102400 4881.050000
In [15]:
def cat_summary(x):
    return pd.Series([x.count(),x.isnull().sum(),x.value_counts(),x.unique()],
                    index=['N','NMISS','ColumnNames','UniqueValues'])
In [16]:
cat_summary=custdata_df_cat.apply(lambda x:cat_summary(x)).T
In [17]:
cat_summary
Out[17]:
N NMISS ColumnNames UniqueValues
region 5000 0 5 1027 1 1009 3 1003 2 995 4 ... [1, 5, 3, 4, 2]
townsize 4998 2 1.0 1436 2.0 1048 3.0 907 4.0 85... [2.0, 5.0, 4.0, 3.0, 1.0, nan]
gender 5000 0 1 2518 0 2482 Name: gender, dtype: int64 [1, 0]
agecat 5000 0 4 1222 5 1195 6 1068 3 893 2 ... [2, 6, 3, 5, 4]
birthmonth 5000 0 September 458 May 451 June ... [September, May, June, July, August, October, ...
edcat 5000 0 2 1567 4 1111 3 1022 1 946 5 ... [3, 4, 2, 1, 5]
jobcat 5000 0 2 1640 1 1388 6 688 3 620 5 ... [1, 2, 3, 6, 4, 5]
union 5000 0 0 4244 1 756 Name: union, dtype: int64 [1, 0]
employ 5000 0 0 659 1 389 2 318 3 309 4 ... [0, 16, 1, 22, 10, 11, 15, 19, 8, 4, 12, 3, 27...
empcat 5000 0 2 1180 5 1135 1 1048 3 968 4 ... [1, 5, 3, 4, 2]
retire 5000 0 0 4262 1 738 Name: retire, dtype: int64 [0, 1]
inccat 5000 0 2 1797 1 1330 3 839 4 650 5 ... [2, 1, 4, 3, 5]
default 5000 0 0 3829 1 1171 Name: default, dtype: int64 [1, 0]
jobsat 5000 0 3 1085 2 1031 4 1016 1 975 5 ... [1, 4, 2, 5, 3]
marital 5000 0 0 2599 1 2401 Name: marital, dtype: int64 [0, 1]
spousedcat 5000 0 -1 2599 2 789 1 606 3 507 4... [-1, 2, 4, 3, 1, 5]
homeown 5000 0 1 3148 0 1852 Name: homeown, dtype: int64 [0, 1]
hometype 5000 0 1 2265 2 1548 3 896 4 291 Name: ... [2, 3, 1, 4]
address 5000 0 0 245 2 196 4 195 5 177 3 ... [0, 2, 30, 3, 31, 21, 20, 19, 14, 5, 9, 32, 29...
addresscat 5000 0 3 1221 5 1157 4 1139 2 873 1 ... [1, 5, 2, 4, 3]
cars 5000 0 2 1607 1 1119 3 1082 0 497 4 ... [2, 3, 1, 0, 4, 5, 7, 6, 8]
carown 5000 0 1 3704 0 799 -1 497 Name: carown,... [1, 0, -1]
cartype 5000 0 0 2287 1 2216 -1 497 Name: cartype... [0, 1, -1]
carcatvalue 5000 0 1 2399 2 1267 3 837 -1 497 Na... [1, -1, 2, 3]
carbought 5000 0 0 2901 1 1602 -1 497 Name: carboug... [0, -1, 1]
carbuy 5000 0 0 3195 1 1805 Name: carbuy, dtype: int64 [0, 1]
commute 5000 0 1 2855 4 635 8 585 5 302 3 ... [8, 1, 4, 6, 5, 3, 10, 2, 7, 9]
commutecat 5000 0 1 2905 3 981 4 666 2 295 5 ... [4, 1, 3, 2, 5]
commutecar 5000 0 1 3395 0 1605 Name: commutecar, dtype: i... [0, 1]
commutemotorcycle 5000 0 0 4487 1 513 Name: commutemotorcycle, d... [1, 0]
... ... ... ... ...
card2tenurecat 5000 0 5 1923 2 1019 3 933 4 760 1 ... [2, 5, 3, 1, 4]
active 5000 0 0 2670 1 2330 Name: active, dtype: int64 [0, 1]
bfast 5000 0 3 1875 1 1582 2 1543 Name: bfast, dty... [3, 1, 2]
churn 5000 0 0 3734 1 1266 Name: churn, dtype: int64 [1, 0]
tollfree 5000 0 0 2622 1 2378 Name: tollfree, dtype: int64 [1, 0]
equip 5000 0 0 3296 1 1704 Name: equip, dtype: int64 [1, 0]
callcard 5000 0 1 3581 0 1419 Name: callcard, dtype: int64 [1, 0]
wireless 5000 0 0 3656 1 1344 Name: wireless, dtype: int64 [0, 1]
multline 5000 0 0 2558 1 2442 Name: multline, dtype: int64 [1, 0]
voice 5000 0 0 3485 1 1515 Name: voice, dtype: int64 [1, 0]
pager 5000 0 0 3782 1 1218 Name: pager, dtype: int64 [1, 0]
internet 5000 0 0 2498 1 774 3 598 4 585 2 ... [0, 4, 2, 3, 1]
callid 5000 0 0 2624 1 2376 Name: callid, dtype: int64 [0, 1]
callwait 5000 0 0 2605 1 2395 Name: callwait, dtype: int64 [1, 0]
forward 5000 0 0 2597 1 2403 Name: forward, dtype: int64 [1, 0]
confer 5000 0 0 2610 1 2390 Name: confer, dtype: int64 [1, 0]
ebill 5000 0 0 3257 1 1743 Name: ebill, dtype: int64 [0, 1]
owntv 5000 0 1 4915 0 85 Name: owntv, dtype: int64 [1, 0]
ownvcr 5000 0 1 4578 0 422 Name: ownvcr, dtype: int64 [1, 0]
owndvd 5000 0 1 4568 0 432 Name: owndvd, dtype: int64 [1, 0]
owncd 5000 0 1 4664 0 336 Name: owncd, dtype: int64 [0, 1]
ownpda 5000 0 0 3995 1 1005 Name: ownpda, dtype: int64 [0, 1]
ownpc 5000 0 1 3164 0 1836 Name: ownpc, dtype: int64 [0, 1]
ownipod 5000 0 0 2604 1 2396 Name: ownipod, dtype: int64 [1, 0]
owngame 5000 0 0 2626 1 2374 Name: owngame, dtype: int64 [1, 0]
ownfax 5000 0 0 4106 1 894 Name: ownfax, dtype: int64 [0, 1]
news 5000 0 0 2637 1 2363 Name: news, dtype: int64 [0, 1]
response_01 5000 0 0 4582 1 418 Name: response_01, dtype: ... [0, 1]
response_02 5000 0 0 4351 1 649 Name: response_02, dtype: ... [1, 0]
response_03 5000 0 0 4487 1 513 Name: response_03, dtype: ... [0, 1]

84 rows × 4 columns

Handling Outliers

There are some extreame high or extream low value which need to rectify so that it will not have impact on our model

In [18]:
#Handling Outliers for numerical data - Through function

def outlier_capping(x):
    x = x.clip_upper(x.quantile(0.99))
    x = x.clip_lower(x.quantile(0.01))
    return x

custdata_df_num = custdata_df_num.apply(lambda x: outlier_capping(x))

Handling Missing Values

As we can see in the above audit report some data is missing in nmerical data set and some are missing in categorical.

We will treat numerical missing data with mean() and categorical data with mode()

In [19]:
#Handling missings - by Function (Makethe function to treat all data in one shot)

def Missing_imputation(x):
    x = x.fillna(x.mean())
    return x

custdata_df_num = custdata_df_num.apply(lambda x: Missing_imputation(x))
In [20]:
#Handling missings - by Function (Makethe function to treat all data in one shot)

def Missing_imputation(x):
    x = x.fillna(x.mode())
    return x

custdata_df_cat = custdata_df_cat.apply(lambda x: Missing_imputation(x))

Again check missing values has been treated or not

In [21]:
# Find the total number of missing values in the numerical data i.e custdata_df_num
print ("\nMissing values in numerical data :  ", custdata_df_num.isnull().sum().values.sum())
print ("\nMissing values in categorical data :  ", custdata_df_num.isnull().sum().values.sum())
Missing values in numerical data :   0

Missing values in categorical data :   0

Dummy creation for categorical data

In [22]:
# We need to create a function for dummy creation
def create_dummies(df, colname):
    col_dummies = pd.get_dummies(df[colname], prefix = colname)
    col_dummies.drop(col_dummies.columns[0], axis = 1, inplace = True)
    df = pd.concat([df, col_dummies], axis = 1)
    df.drop(colname, axis = 1, inplace = True)
    return df
In [23]:
custdata_df_cat=custdata_df.select_dtypes(include=['object'])
cat_varlist=list(custdata_df_cat.columns)
In [24]:
# For customer_features in categorical features

for c_feature in cat_varlist:
    custdata_df_cat[c_feature]=custdata_df_cat[c_feature].astype('category')
    custdata_df_cat=create_dummies(custdata_df_cat,c_feature)
In [25]:
custdata_df_cat.sample(5)
Out[25]:
region_2 region_3 region_4 region_5 townsize_2.0 townsize_3.0 townsize_4.0 townsize_5.0 gender_1 agecat_3 agecat_4 agecat_5 agecat_6 birthmonth_August birthmonth_December birthmonth_February birthmonth_January birthmonth_July birthmonth_June birthmonth_March birthmonth_May birthmonth_November birthmonth_October birthmonth_September edcat_2 edcat_3 edcat_4 edcat_5 jobcat_2 jobcat_3 jobcat_4 jobcat_5 jobcat_6 union_1 employ_1 employ_2 employ_3 employ_4 employ_5 employ_6 employ_7 employ_8 employ_9 employ_10 employ_11 employ_12 employ_13 employ_14 employ_15 employ_16 employ_17 employ_18 employ_19 employ_20 employ_21 employ_22 employ_23 employ_24 employ_25 employ_26 employ_27 employ_28 employ_29 employ_30 employ_31 employ_32 employ_33 employ_34 employ_35 employ_36 employ_37 employ_38 employ_39 employ_40 employ_41 employ_42 employ_43 employ_44 employ_45 employ_46 employ_47 employ_48 employ_49 employ_51 employ_52 empcat_2 empcat_3 empcat_4 empcat_5 retire_1 inccat_2 inccat_3 inccat_4 inccat_5 default_1 jobsat_2 jobsat_3 jobsat_4 jobsat_5 marital_1 spousedcat_1 spousedcat_2 spousedcat_3 spousedcat_4 spousedcat_5 homeown_1 hometype_2 hometype_3 hometype_4 address_1 address_2 address_3 address_4 address_5 address_6 address_7 address_8 address_9 address_10 address_11 address_12 address_13 address_14 address_15 address_16 address_17 address_18 address_19 address_20 address_21 address_22 address_23 address_24 address_25 address_26 address_27 address_28 address_29 address_30 address_31 address_32 address_33 address_34 address_35 address_36 address_37 address_38 address_39 address_40 address_41 address_42 address_43 address_44 address_45 address_46 address_47 address_48 address_49 address_50 address_51 address_52 address_53 address_54 address_55 address_57 addresscat_2 addresscat_3 addresscat_4 addresscat_5 cars_1 cars_2 cars_3 cars_4 cars_5 cars_6 cars_7 cars_8 carown_0 carown_1 cartype_0 cartype_1 carcatvalue_1 carcatvalue_2 carcatvalue_3 carbought_0 carbought_1 carbuy_1 commute_2 commute_3 commute_4 commute_5 commute_6 commute_7 commute_8 commute_9 commute_10 commutecat_2 commutecat_3 commutecat_4 commutecat_5 commutecar_1 commutemotorcycle_1 commutecarpool_1 commutebus_1 commuterail_1 commutepublic_1 commutebike_1 commutewalk_1 commutenonmotor_1 telecommute_1 reason_2 reason_3 reason_4 reason_9 polview_2 polview_3 polview_4 polview_5 polview_6 polview_7 polparty_1 polcontrib_1 vote_1 card_2 card_3 card_4 card_5 cardtype_2 cardtype_3 cardtype_4 cardbenefit_2 cardbenefit_3 cardbenefit_4 cardfee_1 cardtenure_1 cardtenure_2 cardtenure_3 cardtenure_4 cardtenure_5 cardtenure_6 cardtenure_7 cardtenure_8 cardtenure_9 cardtenure_10 cardtenure_11 cardtenure_12 cardtenure_13 cardtenure_14 cardtenure_15 cardtenure_16 cardtenure_17 cardtenure_18 cardtenure_19 cardtenure_20 cardtenure_21 cardtenure_22 cardtenure_23 cardtenure_24 cardtenure_25 cardtenure_26 cardtenure_27 cardtenure_28 cardtenure_29 cardtenure_30 cardtenure_31 cardtenure_32 cardtenure_33 cardtenure_34 cardtenure_35 cardtenure_36 cardtenure_37 cardtenure_38 cardtenure_39 cardtenure_40 cardtenurecat_2 cardtenurecat_3 cardtenurecat_4 cardtenurecat_5 card2_2 card2_3 card2_4 card2_5 card2type_2 card2type_3 card2type_4 card2benefit_2 card2benefit_3 card2benefit_4 card2fee_1 card2tenure_1 card2tenure_2 card2tenure_3 card2tenure_4 card2tenure_5 card2tenure_6 card2tenure_7 card2tenure_8 card2tenure_9 card2tenure_10 card2tenure_11 card2tenure_12 card2tenure_13 card2tenure_14 card2tenure_15 card2tenure_16 card2tenure_17 card2tenure_18 card2tenure_19 card2tenure_20 card2tenure_21 card2tenure_22 card2tenure_23 card2tenure_24 card2tenure_25 card2tenure_26 card2tenure_27 card2tenure_28 card2tenure_29 card2tenure_30 card2tenurecat_2 card2tenurecat_3 card2tenurecat_4 card2tenurecat_5 active_1 bfast_2 bfast_3 churn_1 tollfree_1 equip_1 callcard_1 wireless_1 multline_1 voice_1 pager_1 internet_1 internet_2 internet_3 internet_4 callid_1 callwait_1 forward_1 confer_1 ebill_1 owntv_1 ownvcr_1 owndvd_1 owncd_1 ownpda_1 ownpc_1 ownipod_1 owngame_1 ownfax_1 news_1 response_01_1 response_02_1 response_03_1
1760 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 1 1 0 0 0 1 0 0 0 0 0 1 1 1 1 1 0 1 0 1 0 0 0 1 0
789 0 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 1 1 0 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0
1830 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 1 1 0 1 0 1 0 1 0 0 0
2970 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 1 0 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0
4839 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 0 1 0 1 0 0 0 0 0 0 1 1 1 1 0 1 1 1 1 0 0 0 0 0 1 0 1 0

Merge Numerical and categorical data

Now we have nice simple clean data for numerical and categorical both data frame, so now we can merge both data set

In [26]:
custdata_df_new = pd.concat([custdata_df_num, custdata_df_cat], axis=1)
In [27]:
custdata_df_new.head()
Out[27]:
age ed income lninc debtinc creddebt lncreddebt othdebt lnothdebt spoused reside pets pets_cats pets_dogs pets_birds pets_reptiles pets_small pets_saltfish pets_freshfish carvalue commutetime tenure longmon lnlongmon longten lnlongten tollmon lntollmon tollten lntollten equipmon lnequipmon equipten lnequipten cardmon lncardmon cardten lncardten wiremon lnwiremon wireten lnwireten hourstv totalspend region_2 region_3 region_4 region_5 townsize_2.0 townsize_3.0 townsize_4.0 townsize_5.0 gender_1 agecat_3 agecat_4 agecat_5 agecat_6 birthmonth_August birthmonth_December birthmonth_February birthmonth_January birthmonth_July birthmonth_June birthmonth_March birthmonth_May birthmonth_November birthmonth_October birthmonth_September edcat_2 edcat_3 edcat_4 edcat_5 jobcat_2 jobcat_3 jobcat_4 jobcat_5 jobcat_6 union_1 employ_1 employ_2 employ_3 employ_4 employ_5 employ_6 employ_7 employ_8 employ_9 employ_10 employ_11 employ_12 employ_13 employ_14 employ_15 employ_16 employ_17 employ_18 employ_19 employ_20 employ_21 employ_22 employ_23 employ_24 employ_25 employ_26 employ_27 employ_28 employ_29 employ_30 employ_31 employ_32 employ_33 employ_34 employ_35 employ_36 employ_37 employ_38 employ_39 employ_40 employ_41 employ_42 employ_43 employ_44 employ_45 employ_46 employ_47 employ_48 employ_49 employ_51 employ_52 empcat_2 empcat_3 empcat_4 empcat_5 retire_1 inccat_2 inccat_3 inccat_4 inccat_5 default_1 jobsat_2 jobsat_3 jobsat_4 jobsat_5 marital_1 spousedcat_1 spousedcat_2 spousedcat_3 spousedcat_4 spousedcat_5 homeown_1 hometype_2 hometype_3 hometype_4 address_1 address_2 address_3 address_4 address_5 address_6 address_7 address_8 address_9 address_10 address_11 address_12 address_13 address_14 address_15 address_16 address_17 address_18 address_19 address_20 address_21 address_22 address_23 address_24 address_25 address_26 address_27 address_28 address_29 address_30 address_31 address_32 address_33 address_34 address_35 address_36 address_37 address_38 address_39 address_40 address_41 address_42 address_43 address_44 address_45 address_46 address_47 address_48 address_49 address_50 address_51 address_52 address_53 address_54 address_55 address_57 addresscat_2 addresscat_3 addresscat_4 addresscat_5 cars_1 cars_2 cars_3 cars_4 cars_5 cars_6 cars_7 cars_8 carown_0 carown_1 cartype_0 cartype_1 carcatvalue_1 carcatvalue_2 carcatvalue_3 carbought_0 carbought_1 carbuy_1 commute_2 commute_3 commute_4 commute_5 commute_6 commute_7 commute_8 commute_9 commute_10 commutecat_2 commutecat_3 commutecat_4 commutecat_5 commutecar_1 commutemotorcycle_1 commutecarpool_1 commutebus_1 commuterail_1 commutepublic_1 commutebike_1 commutewalk_1 commutenonmotor_1 telecommute_1 reason_2 reason_3 reason_4 reason_9 polview_2 polview_3 polview_4 polview_5 polview_6 polview_7 polparty_1 polcontrib_1 vote_1 card_2 card_3 card_4 card_5 cardtype_2 cardtype_3 cardtype_4 cardbenefit_2 cardbenefit_3 cardbenefit_4 cardfee_1 cardtenure_1 cardtenure_2 cardtenure_3 cardtenure_4 cardtenure_5 cardtenure_6 cardtenure_7 cardtenure_8 cardtenure_9 cardtenure_10 cardtenure_11 cardtenure_12 cardtenure_13 cardtenure_14 cardtenure_15 cardtenure_16 cardtenure_17 cardtenure_18 cardtenure_19 cardtenure_20 cardtenure_21 cardtenure_22 cardtenure_23 cardtenure_24 cardtenure_25 cardtenure_26 cardtenure_27 cardtenure_28 cardtenure_29 cardtenure_30 cardtenure_31 cardtenure_32 cardtenure_33 cardtenure_34 cardtenure_35 cardtenure_36 cardtenure_37 cardtenure_38 cardtenure_39 cardtenure_40 cardtenurecat_2 cardtenurecat_3 cardtenurecat_4 cardtenurecat_5 card2_2 card2_3 card2_4 card2_5 card2type_2 card2type_3 card2type_4 card2benefit_2 card2benefit_3 card2benefit_4 card2fee_1 card2tenure_1 card2tenure_2 card2tenure_3 card2tenure_4 card2tenure_5 card2tenure_6 card2tenure_7 card2tenure_8 card2tenure_9 card2tenure_10 card2tenure_11 card2tenure_12 card2tenure_13 card2tenure_14 card2tenure_15 card2tenure_16 card2tenure_17 card2tenure_18 card2tenure_19 card2tenure_20 card2tenure_21 card2tenure_22 card2tenure_23 card2tenure_24 card2tenure_25 card2tenure_26 card2tenure_27 card2tenure_28 card2tenure_29 card2tenure_30 card2tenurecat_2 card2tenurecat_3 card2tenurecat_4 card2tenurecat_5 active_1 bfast_2 bfast_3 churn_1 tollfree_1 equip_1 callcard_1 wireless_1 multline_1 voice_1 pager_1 internet_1 internet_2 internet_3 internet_4 callid_1 callwait_1 forward_1 confer_1 ebill_1 owntv_1 ownvcr_1 owndvd_1 owncd_1 ownpda_1 ownpc_1 ownipod_1 owngame_1 ownfax_1 news_1 response_01_1 response_02_1 response_03_1
0 20.0 15.0 31.0 3.433987 11.1 1.200909 0.183079 2.240091 0.806516 -1.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 14.3 22.0 5.0 6.50 1.871802 34.40 3.538057 29.0 3.367296 161.05 5.081715 29.50 3.384390 126.1 4.837075 14.25 2.656757 60.0 4.094345 0.00 3.604226 0.00 6.808151 13.0 149.46 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 1 1 0 1 1 1 0 0 0 0 0 1 1 1 0 1 1 1 0 0 0 1 1 0 0 0 1 0
1 22.0 17.0 15.0 2.708050 18.6 1.222020 0.200505 1.567980 0.449788 -1.0 2.0 6.0 0.0 0.0 0.0 0.0 0.0 0.0 6.0 6.8 29.0 39.0 8.90 2.186051 330.60 5.800909 0.0 3.242727 0.00 6.585937 54.85 4.004602 1975.0 7.588324 16.00 2.772589 610.0 6.413459 45.65 3.821004 1683.55 7.428660 18.0 77.54 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 1 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 1 1 1 1 1 0 0 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0
2 67.0 14.0 35.0 3.555348 9.9 0.928620 -0.074056 2.536380 0.930738 13.0 3.0 3.0 2.0 1.0 0.0 0.0 0.0 0.0 0.0 18.8 24.0 65.0 28.40 3.346389 1858.35 7.527444 0.0 3.242727 0.00 6.585937 0.00 3.599725 0.0 6.747846 23.00 3.135494 1410.0 7.251345 0.00 3.604226 0.00 6.808151 21.0 359.97 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 1 0 0 0
3 23.0 16.0 20.0 2.995732 5.7 0.033160 -3.401690 1.117200 0.110826 18.0 5.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 8.7 38.0 36.0 6.00 1.791759 199.45 5.295564 0.0 3.242727 0.00 6.585937 0.00 3.599725 0.0 6.747846 21.00 3.044522 685.0 6.529419 0.00 3.604226 0.00 6.808151 26.0 359.41 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 1 0 1 1 1 0 1 1 0 0
4 26.0 16.0 23.0 3.135494 1.7 0.214659 -1.538705 0.176341 -1.735336 13.0 4.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 10.6 32.0 21.0 3.05 1.115142 74.10 4.305416 16.5 2.803360 387.70 5.960232 0.00 3.599725 0.0 6.747846 17.25 2.847812 360.0 5.886104 19.05 2.947067 410.80 6.018106 27.0 507.83 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 1 0 1 1 0 1 0 0 0 1 0 1 1 1 1 0 1 1 1 1 0 1 0 1 0 0 0 1 0
In [27]:
# Some features has highly correlation with their log valuesso I am going to drop it.

#custdata_df_new.drop(['cardten', 'lninc','lncreddebt','lnothdebt','lnlongmon','lnlongten','lntollmon','lntollten',
#'lnequipmon','lnequipten','lncardten','lnwiremon','lnwireten'],axis=1, inplace=True)
In [28]:
custdata_df_new.shape
Out[28]:
(5000, 400)

Explore data and check the variable distribution

In [29]:
# For Linear regressin y(totalspend) should follow normal istribution
import seaborn as sns
sns.distplot(custdata_df_new.totalspend)
Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0xc5866a0>

Now as you can see distribution is right skewed so we need to take log and then plot the graph.

In [30]:
from scipy import stats
import pylab

stats.probplot(custdata_df_new.totalspend, dist="norm", plot=pylab )
pylab.show()

As we can see above graph our data is not distributed normally perfectly, so we will use boxcox techniue to make it perfect normal

In [31]:
from scipy import stats

# transform training data & save lambda value
custdata_df_new['bc_total_spend'],fitted_lambda = stats.boxcox(custdata_df_new['totalspend'])
In [32]:
sns.distplot(custdata_df_new.bc_total_spend)
Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0xc010390>
In [33]:
stats.probplot(custdata_df_new.bc_total_spend, dist="norm", plot=pylab )
pylab.show()
In [34]:
#Drop the y variables(totalspend) from dataframe because we are looking correlation between all x variables

custdata_df_new.drop(['totalspend'],axis=1,inplace=True)

Devide data into train and test data

In [35]:
#Splitting the data in all x variable and y variable.

feature_columns=custdata_df_new.columns.difference(['bc_total_spend'])
In [36]:
from sklearn.model_selection import train_test_split

train_x,test_x,train_y,test_y=train_test_split(custdata_df_new[feature_columns],
                                              custdata_df_new['bc_total_spend'],
                                              test_size=0.2,
                                              random_state=12)
In [37]:
print (len(train_x))
print (len(test_x))
print (len(train_y))
print (len(test_y))
4000
1000
4000
1000

Feature selection by random forrest

In [38]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
In [39]:
RandomForestRegressor?
In [40]:
param_grid={'n_estimators':np.arange(10,25)}

tree=GridSearchCV(RandomForestRegressor(oob_score=False,warm_start=True),param_grid,cv=2)
tree.fit(train_x,train_y)  
Out[40]:
GridSearchCV(cv=2, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=True),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)
In [41]:
tree.best_params_
Out[41]:
{'n_estimators': 23}
In [42]:
# we can take n_estimators': 23
radm_clf = RandomForestRegressor(oob_score=True,n_estimators=23)
radm_clf.fit( train_x, train_y)
Out[42]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=23, n_jobs=1,
           oob_score=True, random_state=None, verbose=0, warm_start=False)
In [43]:
radm_clf.oob_score_
Out[43]:
0.15767032576065787
In [44]:
from sklearn import metrics
In [45]:
indices = np.argsort(radm_clf.feature_importances_)[::-1]
feature_rank = pd.DataFrame( columns = ['rank', 'feature', 'importance'] )
for f in range(train_x.shape[1]):
  feature_rank.loc[f] = [f+1,
                         train_x.columns[indices[f]],
                         radm_clf.feature_importances_[indices[f]]]
sns.barplot( y = 'feature', x = 'importance', data = feature_rank )
Out[45]:
<matplotlib.axes._subplots.AxesSubplot at 0xc09e4a8>
  • As above graph is not clear so we will use below method.
In [46]:
indices = np.argsort(radm_clf.feature_importances_)[::-1]
feature_rank = pd.DataFrame( columns = ['rank', 'feature', 'importance'] )
for f in range(train_x.shape[1]):
  feature_rank.loc[f] = [f+1,
                         train_x.columns[indices[f]],
                         radm_clf.feature_importances_[indices[f]]]

feature_rank
Out[46]:
rank feature importance
0 1 income 1.037454e-01
1 2 lninc 7.589780e-02
2 3 card_3 2.596244e-02
3 4 card_2 2.239525e-02
4 5 carvalue 2.154065e-02
5 6 card_4 2.112358e-02
6 7 reason_2 2.094437e-02
7 8 debtinc 1.930640e-02
8 9 commutetime 1.846777e-02
9 10 hourstv 1.661541e-02
10 11 age 1.652895e-02
11 12 lncreddebt 1.468481e-02
12 13 creddebt 1.400680e-02
13 14 othdebt 1.242060e-02
14 15 ed 1.229365e-02
15 16 lncardmon 1.194613e-02
16 17 tenure 1.115788e-02
17 18 lnothdebt 1.040137e-02
18 19 pets 1.001150e-02
19 20 lncardten 9.964655e-03
20 21 cardmon 9.230549e-03
21 22 lntollmon 9.129050e-03
22 23 longmon 9.106174e-03
23 24 card_5 9.043518e-03
24 25 spoused 8.085497e-03
25 26 cardten 7.930730e-03
26 27 lnlongten 7.926722e-03
27 28 lnequipten 7.834682e-03
28 29 lnlongmon 7.799020e-03
29 30 longten 7.240294e-03
... ... ... ...
369 370 employ_45 8.859233e-05
370 371 address_45 7.747346e-05
371 372 owntv_1 6.995432e-05
372 373 reason_3 5.521577e-05
373 374 address_53 5.338786e-05
374 375 inccat_5 5.133915e-05
375 376 employ_38 5.067010e-05
376 377 address_33 4.766309e-05
377 378 address_23 4.622708e-05
378 379 employ_26 3.523784e-05
379 380 cars_7 3.163522e-05
380 381 address_46 3.095265e-05
381 382 address_41 1.586377e-05
382 383 employ_36 1.417776e-05
383 384 address_43 4.679721e-06
384 385 address_49 2.893917e-06
385 386 address_50 2.708603e-06
386 387 employ_42 7.787210e-07
387 388 employ_47 1.211027e-07
388 389 address_55 1.107699e-07
389 390 address_52 4.552084e-08
390 391 address_54 9.325467e-09
391 392 employ_52 0.000000e+00
392 393 address_57 0.000000e+00
393 394 employ_51 0.000000e+00
394 395 employ_41 0.000000e+00
395 396 employ_49 0.000000e+00
396 397 employ_48 0.000000e+00
397 398 employ_46 0.000000e+00
398 399 cars_8 0.000000e+00

399 rows × 3 columns

In [47]:
#Select features and then convert it into list
x=feature_rank.loc[0:75,['feature']]
x=x['feature'].tolist()
print(x)
['income', 'lninc', 'card_3', 'card_2', 'carvalue', 'card_4', 'reason_2', 'debtinc', 'commutetime', 'hourstv', 'age', 'lncreddebt', 'creddebt', 'othdebt', 'ed', 'lncardmon', 'tenure', 'lnothdebt', 'pets', 'lncardten', 'cardmon', 'lntollmon', 'longmon', 'card_5', 'spoused', 'cardten', 'lnlongten', 'lnequipten', 'lnlongmon', 'longten', 'card2_2', 'lntollten', 'lnequipmon', 'tollmon', 'card2_3', 'lnwiremon', 'gender_1', 'reside', 'tollten', 'pets_cats', 'pets_dogs', 'pets_freshfish', 'equipmon', 'commutewalk_1', 'polparty_1', 'lnwireten', 'polview_4', 'card2benefit_3', 'union_1', 'spousedcat_2', 'card2benefit_2', 'card2type_2', 'vote_1', 'carown_0', 'equipten', 'region_3', 'card2_4', 'carbought_1', 'carbuy_1', 'jobsat_4', 'birthmonth_October', 'active_1', 'townsize_3.0', 'cardbenefit_4', 'region_5', 'cartype_0', 'cardtype_2', 'cardtype_4', 'forward_1', 'commutebus_1', 'jobsat_3', 'commutecar_1', 'card2benefit_4', 'townsize_4.0', 'commuterail_1', 'wiremon']
In [48]:
# Create data frame with selected features
rf_features=['income', 'lninc', 'card_3', 'card_2', 'carvalue', 'card_4', 'commutetime', 'reason_2', 'debtinc', 'hourstv', 'creddebt', 'age', 'lncreddebt', 'tenure', 'lncardmon', 'lnothdebt', 'ed', 'othdebt', 'pets', 'lncardten', 'lntollmon', 'cardmon', 'longmon', 'card_5', 'lnlongmon', 'reside', 'spoused', 'cardten', 'lntollten', 'lnequipmon', 'lnlongten', 'tollten', 'longten', 'tollmon', 'lnwiremon', 'card2_2', 'card2_3', 'pets_freshfish', 'lnequipten', 'gender_1', 'pets_dogs', 'equipten', 'wireten', 'pets_cats', 'address_22', 'equipmon', 'wiremon', 'jobsat_4', 'commutebus_1', 'card2benefit_4']

rf_features.append('bc_total_spend')
df_rf= custdata_df_new[rf_features]

df_rf.head(5)
Out[48]:
income lninc card_3 card_2 carvalue card_4 commutetime reason_2 debtinc hourstv creddebt age lncreddebt tenure lncardmon lnothdebt ed othdebt pets lncardten lntollmon cardmon longmon card_5 lnlongmon reside spoused cardten lntollten lnequipmon lnlongten tollten longten tollmon lnwiremon card2_2 card2_3 pets_freshfish lnequipten gender_1 pets_dogs equipten wireten pets_cats address_22 equipmon wiremon jobsat_4 commutebus_1 card2benefit_4 bc_total_spend
0 31.0 3.433987 1 0 14.3 0 22.0 0 11.1 13.0 1.200909 20.0 0.183079 5.0 2.656757 0.806516 15.0 2.240091 0.0 4.094345 3.367296 14.25 6.50 0 1.871802 3.0 -1.0 60.0 5.081715 3.384390 3.538057 161.05 34.40 29.0 3.604226 0 0 0.0 4.837075 1 0.0 126.1 0.00 0.0 0 29.50 0.00 0 0 0 7.323447
1 15.0 2.708050 0 1 6.8 0 29.0 0 18.6 18.0 1.222020 22.0 0.200505 39.0 2.772589 0.449788 17.0 1.567980 6.0 6.413459 3.242727 16.00 8.90 0 2.186051 2.0 -1.0 610.0 6.585937 4.004602 5.800909 0.00 330.60 0.0 3.821004 0 0 6.0 7.588324 0 0.0 1975.0 1683.55 0.0 0 54.85 45.65 0 1 0 6.039640
2 35.0 3.555348 0 1 18.8 0 24.0 1 9.9 21.0 0.928620 67.0 -0.074056 65.0 3.135494 0.930738 14.0 2.536380 3.0 7.251345 3.242727 23.00 28.40 0 3.346389 3.0 13.0 1410.0 6.585937 3.599725 7.527444 0.00 1858.35 0.0 3.604226 0 0 0.0 6.747846 1 1.0 0.0 0.00 2.0 0 0.00 0.00 1 1 0 9.243615
3 20.0 2.995732 0 1 8.7 0 38.0 0 5.7 26.0 0.033160 23.0 -3.401690 36.0 3.044522 0.110826 16.0 1.117200 0.0 6.529419 3.242727 21.00 6.00 0 1.791759 5.0 18.0 685.0 6.585937 3.599725 5.295564 0.00 199.45 0.0 3.604226 0 1 0.0 6.747846 0 0.0 0.0 0.00 0.0 0 0.00 0.00 0 0 1 9.239995
4 23.0 3.135494 0 0 10.6 1 32.0 0 1.7 27.0 0.214659 26.0 -1.538705 21.0 2.847812 -1.735336 16.0 0.176341 0.0 5.886104 2.803360 17.25 3.05 0 1.115142 4.0 13.0 360.0 5.960232 3.599725 4.305416 387.70 74.10 16.5 2.947067 0 0 0.0 6.747846 0 0.0 0.0 410.80 0.0 0 0.00 19.05 0 0 0 10.063779

Check - 2: Find multicolinearlity

In [49]:
import statsmodels as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from patsy import dmatrices
In [50]:
del rf_features[-1]
In [51]:
%%capture
#gather features
features = "+".join(rf_features)
In [52]:
features
Out[52]:
'income+lninc+card_3+card_2+carvalue+card_4+commutetime+reason_2+debtinc+hourstv+creddebt+age+lncreddebt+tenure+lncardmon+lnothdebt+ed+othdebt+pets+lncardten+lntollmon+cardmon+longmon+card_5+lnlongmon+reside+spoused+cardten+lntollten+lnequipmon+lnlongten+tollten+longten+tollmon+lnwiremon+card2_2+card2_3+pets_freshfish+lnequipten+gender_1+pets_dogs+equipten+wireten+pets_cats+address_22+equipmon+wiremon+jobsat_4+commutebus_1+card2benefit_4'
In [53]:
# get y and X dataframes based on this regression:
y, X = dmatrices('bc_total_spend~' + features,df_rf, return_type='dataframe')
In [54]:
# For each X, calculate VIF and save in dataframe
vif = pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif["features"] = X.columns
In [55]:
vif.sort_values(by=['VIF Factor'],ascending=False)
Out[55]:
VIF Factor features
0 1686.859676 Intercept
23 148.928795 longmon
33 117.386519 longten
25 33.762250 lnlongmon
31 31.415359 lnlongten
14 24.098226 tenure
32 21.097455 tollten
42 19.936232 equipten
34 15.002808 tollmon
46 14.944802 equipmon
28 14.474511 cardten
19 13.324262 pets
47 13.134309 wiremon
43 12.799070 wireten
38 11.869724 pets_freshfish
22 10.962786 cardmon
2 9.948275 lninc
1 9.725833 income
29 8.130066 lntollten
20 7.905704 lncardten
16 7.517424 lnothdebt
39 6.498882 lnequipten
9 6.486941 debtinc
15 5.797448 lncardmon
18 5.167748 othdebt
13 4.276376 lncreddebt
11 3.947735 creddebt
5 3.534663 carvalue
21 3.065122 lntollmon
30 2.381248 lnequipmon
26 2.006284 reside
12 1.972903 age
4 1.882678 card_2
3 1.856482 card_3
27 1.833635 spoused
44 1.833349 pets_cats
6 1.788595 card_4
41 1.742336 pets_dogs
35 1.599433 lnwiremon
17 1.465111 ed
37 1.311558 card2_3
36 1.294873 card2_2
24 1.257129 card_5
48 1.039194 jobsat_4
10 1.037751 hourstv
8 1.022514 reason_2
40 1.021022 gender_1
45 1.019974 address_22
7 1.015482 commutetime
49 1.013662 commutebus_1
50 1.006757 card2benefit_4
In [56]:
#Select only those features whose Vif is greater than 10.
vif1=vif[vif['VIF Factor']>10].reset_index().loc[:,['features']]
vif1.drop([0],axis=0,inplace=True)
drop_vars= vif1["features"].tolist()
drop_vars
Out[56]:
['tenure',
 'pets',
 'cardmon',
 'longmon',
 'lnlongmon',
 'cardten',
 'lnlongten',
 'tollten',
 'longten',
 'tollmon',
 'pets_freshfish',
 'equipten',
 'wireten',
 'equipmon',
 'wiremon']
In [57]:
# dropping variables that have VIF greater than 10
df_rf.drop(drop_vars,axis=1,inplace=True)
In [58]:
df_rf.shape
Out[58]:
(5000, 36)

Check 3 - All x variables should have a linear relationship with Y

In [59]:
sns.lmplot(x="income",y="bc_total_spend",data=df_rf,aspect=5,scatter_kws={'alpha':0.5})
Out[59]:
<seaborn.axisgrid.FacetGrid at 0xb319320>

Splitting data for training and testing

In [60]:
# Dropping variables one at a time which have p-values greater than 5%
feature_columns=df_rf.columns.difference(['bc_total_spend','address_22','pets_cats','lncardmon','lnequipten','spoused'
                                         ,'creddebt','reside','othdebt','debtinc','lnothdebt','jobsat_4','income','carvalue',
                                         'card2benefit_4','commutebus_1','hourstv','lnwiremon','pets_dogs','lncardten',
                                         'commutetime','lntollten','lntollmon','lnequipmon'])
In [61]:
train_x,test_x,train_y,test_y=train_test_split(df_rf[feature_columns],
                                              df_rf['bc_total_spend'],
                                               test_size=.2,
                                               random_state=123)

Build Regression model using statsmodels.api

In [62]:
import statsmodels.api as sm
In [63]:
train_x = sm.add_constant(train_x)
test_x=sm.add_constant(test_x)
lm=sm.OLS(train_y,train_x).fit()
In [64]:
print(lm.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:         bc_total_spend   R-squared:                       0.325
Model:                            OLS   Adj. R-squared:                  0.323
Method:                 Least Squares   F-statistic:                     159.7
Date:                Tue, 01 Oct 2019   Prob (F-statistic):               0.00
Time:                        22:46:22   Log-Likelihood:                -6598.7
No. Observations:                4000   AIC:                         1.322e+04
Df Residuals:                    3987   BIC:                         1.331e+04
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          8.2401      0.172     47.920      0.000       7.903       8.577
age           -0.0042      0.001     -3.681      0.000      -0.006      -0.002
card2_2       -0.3794      0.052     -7.349      0.000      -0.481      -0.278
card2_3       -0.3452      0.051     -6.788      0.000      -0.445      -0.246
card_2        -1.2433      0.064    -19.525      0.000      -1.368      -1.118
card_3        -1.2391      0.064    -19.465      0.000      -1.364      -1.114
card_4        -1.2693      0.060    -21.022      0.000      -1.388      -1.151
card_5        -0.9928      0.108     -9.214      0.000      -1.204      -0.782
ed            -0.0154      0.007     -2.349      0.019      -0.028      -0.003
gender_1      -0.1489      0.040     -3.713      0.000      -0.227      -0.070
lncreddebt     0.0396      0.020      1.997      0.046       0.001       0.079
lninc          0.8090      0.034     23.600      0.000       0.742       0.876
reason_2       0.7071      0.081      8.746      0.000       0.549       0.866
==============================================================================
Omnibus:                        4.553   Durbin-Watson:                   1.976
Prob(Omnibus):                  0.103   Jarque-Bera (JB):                4.573
Skew:                          -0.082   Prob(JB):                        0.102
Kurtosis:                       2.983   Cond. No.                         473.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [65]:
lm.pvalues.sort_values(ascending=False)
Out[65]:
lncreddebt     4.587652e-02
ed             1.887734e-02
age            2.357317e-04
gender_1       2.078595e-04
card2_3        1.303811e-11
card2_2        2.408523e-13
reason_2       3.210677e-18
card_5         4.947423e-20
card_3         1.090371e-80
card_2         3.707595e-81
card_4         3.890897e-93
lninc         2.235272e-115
const          0.000000e+00
dtype: float64
In [66]:
print('Parameters:', lm.params) # Find the parameters of x i.e Beta value
print('R2: ', lm.rsquared) # Find the r**2
Parameters: const         8.240115
age          -0.004194
card2_2      -0.379415
card2_3      -0.345223
card_2       -1.243274
card_3       -1.239082
card_4       -1.269314
card_5       -0.992801
ed           -0.015444
gender_1     -0.148872
lncreddebt    0.039624
lninc         0.809044
reason_2      0.707096
dtype: float64
R2:  0.3246950341215388

Eveluation of model accuracy

In [67]:
test_pred=lm.predict(test_x)
train_pred=lm.predict(train_x)

from sklearn import metrics

print('MSE Test:',metrics.mean_squared_error(test_y,test_pred))
print('MSE Train:',metrics.mean_squared_error(train_y,train_pred))
MSE Test: 1.5275403610908898
MSE Train: 1.5864409028377782
In [68]:
print ('MAE:', metrics.mean_absolute_error(test_y, test_pred))
print ('MSE:', metrics.mean_squared_error(test_y, test_pred))
print ('RMSE:', np.sqrt(metrics.mean_squared_error(test_y, test_pred)))
MAE: 0.9762061585444958
MSE: 1.5275403610908898
RMSE: 1.2359370376725871
In [69]:
MAPE_train = '%.3f' % np.mean(np.abs(train_y-train_pred)/(train_y))
MAPE_test = '%.3f' % np.mean(np.abs(test_y-test_pred)/(test_y))

# print the values of MAPE for train and test
print('MAPE of training data: ', MAPE_train,  ' | ', 'MAPE of testing data: ', MAPE_test)
MAPE of training data:  0.110  |  MAPE of testing data:  0.106

Check Normality and Residuals

We will use Q-Q plot to examine this

In [70]:
residuals=train_y-train_pred

import seaborn as sns

sns.distplot(residuals)
Out[70]:
<matplotlib.axes._subplots.AxesSubplot at 0xf889278>
In [71]:
from scipy import stats
import pylab

stats.probplot(residuals,dist='norm',plot=pylab)
pylab.show()

From the above histogram and Q-Q plot, shows that the residuals are normally distributed, so our assumption is not violated

Predicting sales:

We are going to credit card sales on Testing data

In [73]:
df_rf=df_rf[feature_columns]

df_rf=sm.add_constant(df_rf)

df_rf.head()
Out[73]:
const age card2_2 card2_3 card_2 card_3 card_4 card_5 ed gender_1 lncreddebt lninc reason_2
0 1.0 20.0 0 0 0 1 0 0 15.0 1 0.183079 3.433987 0
1 1.0 22.0 0 0 1 0 0 0 17.0 0 0.200505 2.708050 0
2 1.0 67.0 0 0 1 0 0 0 14.0 1 -0.074056 3.555348 1
3 1.0 23.0 0 1 1 0 0 0 16.0 0 -3.401690 2.995732 0
4 1.0 26.0 0 0 0 0 1 0 16.0 0 -1.538705 3.135494 0
In [75]:
# Final prediction on test file
from scipy.special import boxcox, inv_boxcox

pred_total_spend=inv_boxcox(lm.predict(df_rf),fitted_lambda)
pred_total_spend=pd.DataFrame(pred_total_spend.tolist())
pred_total_spend.columns = ['pred_total_spend']


#Concatenating final prediction with original test file
  
testfile = pd.concat([custdata_df, pred_total_spend], axis=1)

testfile.head()
Out[75]:
region townsize gender age agecat birthmonth ed edcat jobcat union employ empcat retire income lninc inccat debtinc creddebt lncreddebt othdebt lnothdebt default jobsat marital spoused spousedcat reside pets pets_cats pets_dogs pets_birds pets_reptiles pets_small pets_saltfish pets_freshfish homeown hometype address addresscat cars carown cartype carvalue carcatvalue carbought carbuy commute commutecat commutetime commutecar commutemotorcycle commutecarpool commutebus commuterail commutepublic commutebike commutewalk commutenonmotor telecommute reason polview polparty polcontrib vote card cardtype cardbenefit cardfee cardtenure cardtenurecat card2 card2type card2benefit card2fee card2tenure card2tenurecat active bfast tenure churn longmon lnlongmon longten lnlongten tollfree tollmon lntollmon tollten lntollten equip equipmon lnequipmon equipten lnequipten callcard cardmon lncardmon cardten lncardten wireless wiremon lnwiremon wireten lnwireten multline voice pager internet callid callwait forward confer ebill owntv hourstv ownvcr owndvd owncd ownpda ownpc ownipod owngame ownfax news response_01 response_02 response_03 totalspend pred_total_spend
0 1 2 1 20 2 September 15 3 1 1 0 1 0 31 3.433987 2 11.1 1.200909 0.183079 2.240091 0.806516 1 1 0 -1 -1 3 0 0 0 0 0 0 0 0 0 2 0 1 2 1 0 14.3 1 0 0 8 4 22.0 0 1 1 0 0 0 0 1 0 0 9 6 1 0 1 3 1 1 0 2 2 5 3 1 0 3 2 0 3 5 1 6.50 1.871802 34.40 3.538057 1 29.0 3.367296 161.05 5.081715 1 29.50 3.384390 126.1 4.837075 1 14.25 2.656757 60.0 4.094345 0 0.00 NaN 0.00 NaN 1 1 1 0 0 1 1 1 0 1 13 1 1 0 0 0 1 1 0 0 0 1 0 149.46 372.303627
1 5 5 0 22 2 May 17 4 2 0 0 1 0 15 2.708050 1 18.6 1.222020 0.200505 1.567980 0.449788 1 1 0 -1 -1 2 6 0 0 0 0 0 0 6 1 3 2 1 2 1 1 6.8 1 0 0 1 1 29.0 1 0 0 1 0 0 1 0 1 1 9 4 1 0 0 2 4 1 0 4 2 4 1 3 0 4 2 1 1 39 0 8.90 2.186051 330.60 5.800909 0 0.0 NaN 0.00 NaN 1 54.85 4.004602 1975.0 7.588324 1 16.00 2.772589 610.0 6.413459 1 45.65 3.821004 1683.55 7.428660 1 1 1 4 1 0 1 0 1 1 18 1 1 1 1 1 1 1 1 1 0 0 0 77.54 302.061632
2 3 4 1 67 6 June 14 2 2 0 16 5 0 35 3.555348 2 9.9 0.928620 -0.074056 2.536380 0.930738 0 4 1 13 2 3 3 2 1 0 0 0 0 0 1 1 30 5 3 1 1 18.8 1 0 1 4 3 24.0 1 0 1 1 1 0 0 0 0 0 2 5 1 0 0 2 1 4 0 35 5 4 1 3 0 25 5 0 3 65 0 28.40 3.346389 1858.35 7.527444 0 0.0 NaN 0.00 NaN 0 0.00 NaN 0.0 NaN 1 23.00 3.135494 1410.0 7.251345 0 0.00 NaN 0.00 NaN 1 0 0 0 0 0 0 0 0 1 21 1 1 1 0 0 0 0 0 1 0 0 0 359.97 480.932947
3 4 3 0 23 2 May 16 3 2 0 0 1 0 20 2.995732 1 5.7 0.022800 -3.780995 1.117200 0.110826 1 2 1 18 4 5 0 0 0 0 0 0 0 0 1 3 3 2 3 1 1 8.7 1 0 1 1 1 38.0 1 0 0 0 0 0 0 0 0 0 9 3 0 0 0 2 1 4 0 5 2 3 2 4 0 5 2 1 1 36 0 6.00 1.791759 199.45 5.295564 0 0.0 NaN 0.00 NaN 0 0.00 NaN 0.0 NaN 1 21.00 3.044522 685.0 6.529419 0 0.00 NaN 0.00 NaN 1 0 0 2 0 0 0 0 1 1 26 1 1 1 0 1 1 1 0 1 1 0 0 359.41 271.020145
4 2 2 0 26 3 July 16 3 2 0 1 1 0 23 3.135494 1 1.7 0.214659 -1.538705 0.176341 -1.735336 0 1 1 13 2 4 0 0 0 0 0 0 0 0 0 2 3 2 1 0 1 10.6 1 0 1 6 3 32.0 0 0 0 0 0 1 0 1 0 0 9 4 0 0 0 4 2 1 0 8 3 1 3 2 0 9 3 1 3 21 0 3.05 1.115142 74.10 4.305416 1 16.5 2.803360 387.70 5.960232 0 0.00 NaN 0.0 NaN 1 17.25 2.847812 360.0 5.886104 1 19.05 2.947067 410.80 6.018106 0 1 0 3 1 1 1 1 0 1 27 1 1 1 0 1 0 1 0 0 0 1 0 507.83 336.914886
In [79]:
#Export the final result in csv.
%cd C:\\Users\\ashwini\\Desktop
testfile.to_csv('Final_submission_credit_spend.csv',index=False)
C:\Users\ashwini\Desktop
In [ ]:
##################End######################

Happy Leaarning..........